In [1]:
import json
import torch
import random
import numpy as np
from datasets import load_from_disk
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaConfig
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

from typing import Optional, Union

import evaluate

In [2]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

In [3]:
# Please replace the paths by your local paths.
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'
DATA_PATH = PATH + '2-baselines/dataset_construction/output/'

In [4]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
device_name = torch.cuda.get_device_name()
print(f"Using device: {device} ({device_name})")

Using device: cuda:0 (NVIDIA RTX A6000)


In [5]:
# Load Pap and Pep data sets.
dataset = load_from_disk(DATA_PATH + 'dataset_baseline/')
dataset = dataset.remove_columns('id')
dataset = dataset.with_format('torch')
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [6]:
# small_train_dataset = dataset["train"].shuffle(seed=42).select(range(10))
# small_dev_dataset = dataset["dev"].shuffle(seed=42).select(range(10))

In [7]:
model_name = 'roberta-large'
# model_name = 'roberta-large-mnli'

In [8]:
# Load tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [9]:
# Speicify configuration for the model.
config_roberta_large = RobertaConfig.from_pretrained(
    model_name,
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
)

config_roberta_large_mnli = RobertaConfig.from_pretrained(
    model_name, 
    num_labels=2, 
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
#     out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True),
)

if model_name == 'roberta-large':
    model_config = config_roberta_large
    print("Model config: roberta-large")

if model_name == 'roberta-large-mnli':
    model_config = config_roberta_large_mnli
    print("Model config: roberta-large-mnli")

Model config: roberta-large


In [10]:
# Load model.
model = RobertaForSequenceClassification.from_pretrained(
    model_name, 
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)

model = model.to(device)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [11]:
print(model.config.num_labels)
print(model.config.id2label)
print(model.config.label2id)

2
{0: 'implausible', 1: 'plausible'}
{'implausible': 0, 'plausible': 1}


In [12]:
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

### Training

In [13]:
# Load evaluation metrics.
auc = evaluate.load('roc_auc')
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

In [14]:
# Training hyperparameters
num_epochs = 10
batch_size = 8
optimizer = "adamw_torch"
lr = 1e-5  ### baseline_ft_1 1e-3 --> 1e-5 ###
weight_decay = 0.01
warmup_steps = 10

In [22]:
def compute_metrics(y_pred):
    """
    This code snippet is produced by ChatGPT.
    """
    
    logits, labels = y_pred
#     print("y_pred:", y_pred, type(y_pred))
#     print("logits:", logits, type(logits))
#     print("labels:", labels, type(labels))
    
    # Compute accuracy, precision, recall and F1-score.
    predictions = logits.argmax(axis=-1)
#     print("predictions:", predictions)
#     precision, recall, f1, num_each_label_in_y_true = precision_recall_fscore_support(labels, predictions, average='binary')
#     accuracy = accuracy_score(labels, predictions)
    accuracy_score = accuracy.compute(references=labels, predictions=predictions)
#     print("accuracy_score:", accuracy_score)
    
    precision_score = precision.compute(references=labels, predictions=predictions)
#     print("precision_score:", precision_score)
    
    recall_score = recall.compute(references=labels, predictions=predictions)
#     print("recall_score:", recall_score)
    
    f1_score = f1.compute(references=labels, predictions=predictions)
#     print("f1_score:", f1_score)
    
    # Compute AUC.
#     probs = logits[:, 1]
#     print("probs:", probs)
#     auc = roc_auc_score(labels, probs)
    auc_score = auc.compute(references=labels, prediction_scores=predictions)
#     print("auc_score:", auc_score)
    
    return {
        'auc': auc_score['roc_auc'],
        'precision': precision_score['precision'],
        'recall': recall_score['recall'],
        'f1': f1_score['f1'],
        'accuracy': accuracy_score['accuracy'],
    }

In [16]:
# Set early stopping.
earlystopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5, 
    early_stopping_threshold=0.0,
)

In [17]:
training_args = TrainingArguments(
    output_dir='./output/baseline_ft_3/',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim=optimizer,
    learning_rate=lr,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='auc',
    greater_is_better=True,
    remove_unused_columns=False,
)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=True,
group_by_

In [16]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
#     train_dataset=small_train_dataset,
    eval_dataset=dataset['dev'],
#     eval_dataset=small_dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer

<transformers.trainer.Trainer at 0x7fd027727c10>

In [17]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Auc,Precision,Recall,F1,Accuracy
1,0.6994,0.675433,0.631922,0.591011,0.856678,0.699468,0.631922
2,0.6796,0.638461,0.677524,0.699634,0.62215,0.658621,0.677524
3,0.5995,0.563819,0.726384,0.776892,0.635179,0.698925,0.726384
4,0.5105,0.645407,0.724756,0.680628,0.846906,0.754717,0.724756
5,0.4353,0.77575,0.749186,0.728358,0.794788,0.760125,0.749186
6,0.3714,0.879594,0.736156,0.708934,0.801303,0.752294,0.736156
7,0.3451,0.956819,0.7443,0.713068,0.81759,0.76176,0.7443
8,0.3042,1.129153,0.742671,0.709859,0.820847,0.761329,0.742671
9,0.2805,1.225771,0.7443,0.714286,0.814332,0.761035,0.7443
10,0.2577,1.302887,0.745928,0.722714,0.798046,0.758514,0.745928


TrainOutput(global_step=6140, training_loss=0.44831948544381106, metrics={'train_runtime': 4136.1051, 'train_samples_per_second': 11.873, 'train_steps_per_second': 1.484, 'total_flos': 4.576714925303808e+16, 'train_loss': 0.44831948544381106, 'epoch': 10.0})

In [18]:
model.save_pretrained('./final_ckpt/baseline_ft_3/')
tokenizer.save_pretrained('./final_ckpt/baseline_ft_3/')

('./final_ckpt/baseline_ft_3/tokenizer_config.json',
 './final_ckpt/baseline_ft_3/special_tokens_map.json',
 './final_ckpt/baseline_ft_3/vocab.json',
 './final_ckpt/baseline_ft_3/merges.txt',
 './final_ckpt/baseline_ft_3/added_tokens.json',
 './final_ckpt/baseline_ft_3/tokenizer.json')

### Evaluation

In [18]:
# Load test set.
testset = load_from_disk(DATA_PATH + 'testsets_baseline/')
testset = testset.remove_columns('id')
testset

DatasetDict({
    pap: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 307
    })
})

In [19]:
pap_test = testset['pap']
pap_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 308
})

In [20]:
pep_test = testset['pep']
pep_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 307
})

In [22]:
# Predict fine-tuned model on pap_test.
preds_pap_test = trainer.predict(pap_test)
preds_pap_test

PredictionOutput(predictions=array([[-0.34985483,  1.1360917 ],
       [ 0.41579422, -0.7566971 ],
       [ 0.3384913 ,  0.22525164],
       [-0.05779028, -0.01394137],
       [ 1.6555184 , -2.2092965 ],
       [-1.8386679 ,  1.7207891 ],
       [ 1.7372831 , -2.1690643 ],
       [ 2.4905803 , -2.7112844 ],
       [-1.529915  ,  2.1498437 ],
       [-1.9498034 ,  1.8533946 ],
       [ 0.43253866, -1.0078654 ],
       [-2.241642  ,  2.264311  ],
       [-2.0830753 ,  1.9846301 ],
       [ 0.17329392, -0.07328358],
       [ 0.62675005, -0.4230287 ],
       [-1.1257495 ,  1.6731563 ],
       [-2.1361816 ,  2.22084   ],
       [-0.08894154,  0.4272504 ],
       [-2.1250677 ,  1.8600863 ],
       [ 2.7100286 , -2.9545913 ],
       [-2.2712438 ,  2.2593524 ],
       [-0.08125896,  1.01189   ],
       [ 1.5058784 , -2.0192907 ],
       [-1.9300748 ,  2.0302916 ],
       [ 0.19922906, -0.43508974],
       [-2.2635744 ,  2.310764  ],
       [-2.1302738 ,  2.3280349 ],
       [-2.2657037 ,  2.24

In [23]:
# Evaluate fine-tuned model on pap_test.
scores_pap_test = trainer.evaluate(pap_test)
scores_pap_test

{'eval_loss': 0.9914360642433167,
 'eval_auc': 0.6396103896103896,
 'eval_precision': 0.6303030303030303,
 'eval_recall': 0.6753246753246753,
 'eval_f1': 0.652037617554859,
 'eval_accuracy': 0.6396103896103896,
 'eval_runtime': 7.5423,
 'eval_samples_per_second': 40.836,
 'eval_steps_per_second': 5.171,
 'epoch': 10.0}

In [24]:
# Predict fine-tuned model on pep_test.
preds_pep_test = trainer.predict(pep_test)
preds_pep_test

PredictionOutput(predictions=array([[-1.4198551 ,  1.6326545 ],
       [-0.18594988,  0.57252747],
       [-2.2382617 ,  2.2772908 ],
       [-2.2359278 ,  2.2230885 ],
       [-2.1967156 ,  2.305694  ],
       [-2.0525832 ,  2.1853905 ],
       [ 2.9055624 , -2.9034195 ],
       [ 1.3304565 , -2.0031247 ],
       [ 2.7066517 , -2.856166  ],
       [-2.2092886 ,  2.2135134 ],
       [ 1.39055   , -1.9760916 ],
       [-2.1496823 ,  2.20287   ],
       [ 2.3069942 , -2.4569337 ],
       [ 2.6160784 , -2.774059  ],
       [ 1.2400185 , -1.7341622 ],
       [ 2.4490151 , -2.731315  ],
       [ 2.3472598 , -2.609867  ],
       [-2.1951432 ,  2.0887702 ],
       [ 0.6037482 ,  0.275419  ],
       [ 2.6821    , -2.9020214 ],
       [-2.230907  ,  2.3979926 ],
       [-2.2089837 ,  2.3175573 ],
       [-1.3304327 ,  1.9353191 ],
       [-2.3030136 ,  2.2745068 ],
       [-2.1966686 ,  2.1754613 ],
       [ 0.5439906 , -0.4476069 ],
       [-1.9468411 ,  1.8496375 ],
       [ 2.128251  , -2.56

In [25]:
# Evaluate fine-tuned model on pep_test.
scores_pep_test = trainer.evaluate(pep_test)
scores_pep_test

{'eval_loss': 0.6852611899375916,
 'eval_auc': 0.7983617689500042,
 'eval_precision': 0.7486338797814208,
 'eval_recall': 0.8954248366013072,
 'eval_f1': 0.8154761904761906,
 'eval_accuracy': 0.7980456026058632,
 'eval_runtime': 7.5412,
 'eval_samples_per_second': 40.71,
 'eval_steps_per_second': 5.172,
 'epoch': 10.0}

In [34]:
softmax = torch.nn.Softmax(dim=1)
softmax

Softmax(dim=1)

In [27]:
probs_pap_test = softmax(torch.tensor(preds_pap_test.predictions)).tolist()

y_pred_pap_test = np.argmax(preds_pap_test.predictions, axis=-1).tolist()

metrics_pap_test = preds_pap_test.metrics
metrics_pap_test['test_loss'] = scores_pap_test['eval_loss']
metrics_pap_test['test_runtime'] = scores_pap_test['eval_runtime']
metrics_pap_test['test_samples_per_second'] = scores_pap_test['eval_samples_per_second']
metrics_pap_test['test_steps_per_second'] = scores_pap_test['eval_steps_per_second']

with open('./evaluation_outputs/baseline_ft_3/preds_pap.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test, 
               'y_pred': y_pred_pap_test, 
               'y_true': preds_pap_test.label_ids.tolist(), 
               'metrics': metrics_pap_test,
              }, file)

In [28]:
probs_pep_test = softmax(torch.tensor(preds_pep_test.predictions)).tolist()

y_pred_pep_test = np.argmax(preds_pep_test.predictions, axis=-1).tolist()

metrics_pep_test = preds_pep_test.metrics
metrics_pep_test['test_loss'] = scores_pep_test['eval_loss']
metrics_pep_test['test_runtime'] = scores_pep_test['eval_runtime']
metrics_pep_test['test_samples_per_second'] = scores_pep_test['eval_samples_per_second']
metrics_pep_test['test_steps_per_second'] = scores_pep_test['eval_steps_per_second']

with open('./evaluation_outputs/baseline_ft_3/preds_pep.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test, 
               'y_pred': y_pred_pep_test, 
               'y_true': preds_pep_test.label_ids.tolist(), 
               'metrics': metrics_pep_test,
              }, file)

### Load model checkpoint with lowest validation loss.
In the training_args above, I set metric_for_best_model='auc'. This is suboptimal since the model checkpoint, for which the validation loss is the lowest and is about to increase, should be considered the best model checkpoint.

In [8]:
tokenizer_best = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='./output/baseline_ft_3/checkpoint-1842/',
    cache_dir=CACHE_DIR,
)

In [13]:
model_best = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='./output/baseline_ft_3/checkpoint-1842/',
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model_best = model_best.to(device)
model_best.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [37]:
trainer_eval = Trainer(
    model=model_best,
    tokenizer=tokenizer_best,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer_eval

<transformers.trainer.Trainer at 0x7f3fef702170>

In [31]:
# Predict the best checkpoint (the one with the highest auc) on pap_test.
preds_pap_best_model = trainer_eval_pap.predict(pap_test)
preds_pap_best_model

PredictionOutput(predictions=array([[ 5.10049939e-01,  8.25089142e-02],
       [ 6.54687464e-01, -5.56243539e-01],
       [ 2.75538892e-01,  4.60056514e-01],
       [ 4.69344944e-01,  6.00758679e-02],
       [ 3.90118152e-01, -1.02157462e+00],
       [ 2.38013029e-01,  5.06133556e-01],
       [ 3.81414175e-01, -8.77464771e-01],
       [ 6.09385908e-01, -8.99545252e-01],
       [ 4.17030901e-01,  1.82039350e-01],
       [-2.11669639e-01,  9.33946908e-01],
       [ 5.58951139e-01, -1.40453309e-01],
       [ 2.85343975e-01,  4.63832527e-01],
       [-1.92025840e-01,  8.09980750e-01],
       [ 6.67983294e-01, -2.86123246e-01],
       [ 5.10050237e-01,  1.91205725e-01],
       [ 4.68902916e-01, -3.17473680e-01],
       [-5.17617154e-04,  7.18436956e-01],
       [ 5.65206885e-01, -6.27544641e-01],
       [ 2.74542421e-01,  4.08847421e-01],
       [ 7.80868649e-01, -1.51473498e+00],
       [-1.93980798e-01,  8.85778189e-01],
       [ 7.85100043e-01, -1.21270053e-01],
       [ 6.98413491e-01, 

In [39]:
# Evaluate the best checkpoint (the one with the highest auc) on pap_test.
scores_pap_best_model = trainer_eval.evaluate(pap_test)
scores_pap_best_model

{'eval_loss': 0.6401085257530212,
 'eval_auc': 0.6461038961038961,
 'eval_precision': 0.7368421052631579,
 'eval_recall': 0.45454545454545453,
 'eval_f1': 0.5622489959839357,
 'eval_accuracy': 0.6461038961038961,
 'eval_runtime': 7.4955,
 'eval_samples_per_second': 41.091,
 'eval_steps_per_second': 5.203}

In [40]:
probs_pap_test_best_model = softmax(torch.tensor(preds_pap_best_model.predictions)).tolist()

y_pred_pap_test_best_model = np.argmax(preds_pap_best_model.predictions, axis=-1).tolist()

metrics_pap_test_best_model = preds_pap_best_model.metrics
metrics_pap_test_best_model['test_loss'] = scores_pap_best_model['eval_loss']
metrics_pap_test_best_model['test_runtime'] = scores_pap_best_model['eval_runtime']
metrics_pap_test_best_model['test_samples_per_second'] = scores_pap_best_model['eval_samples_per_second']
metrics_pap_test_best_model['test_steps_per_second'] = scores_pap_best_model['eval_steps_per_second']

with open('./evaluation_outputs/baseline_ft_3/preds_pap_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test_best_model, 
               'y_pred': y_pred_pap_test_best_model, 
               'y_true': preds_pap_best_model.label_ids.tolist(), 
               'metrics': metrics_pap_test_best_model,
              }, file)

In [41]:
# Predict the best checkpoint (the one with the highest auc) on pep_test.
preds_pep_best_model = trainer_eval.predict(pep_test)
preds_pep_best_model

PredictionOutput(predictions=array([[ 4.75900799e-01, -1.34866023e+00],
       [ 3.36900860e-01,  2.33833134e-01],
       [ 8.72264877e-02,  6.60675049e-01],
       [ 6.43829927e-02,  7.03300357e-01],
       [ 8.02958548e-01, -1.19330704e+00],
       [ 1.14785306e-01,  6.96579695e-01],
       [ 1.38386214e+00, -1.74632955e+00],
       [ 3.35738540e-01, -8.28276098e-01],
       [ 1.20520484e+00, -1.81238294e+00],
       [ 3.43557835e-01,  2.01447874e-01],
       [ 2.99869984e-01, -1.05061519e+00],
       [-6.32707030e-02,  8.05309653e-01],
       [ 8.45761418e-01, -1.04105330e+00],
       [ 7.04152107e-01, -1.37521505e+00],
       [ 6.04030013e-01, -4.09976132e-02],
       [ 7.16398418e-01, -1.65545213e+00],
       [ 1.25809208e-01, -6.18294477e-01],
       [ 1.24173574e-01,  6.36492133e-01],
       [ 4.01664615e-01,  4.53308940e-01],
       [ 9.92375135e-01, -1.66384399e+00],
       [-2.07765445e-01,  9.04037118e-01],
       [ 1.49493113e-01,  4.87894654e-01],
       [ 4.45665777e-01, 

In [44]:
# Evaluate the best checkpoint (the one with the highest auc) on pep_test.
scores_pep_best_model = trainer_eval.evaluate(pep_test)
scores_pep_best_model

{'eval_loss': 0.49421536922454834,
 'eval_auc': 0.7913377472201001,
 'eval_precision': 0.8296296296296296,
 'eval_recall': 0.7320261437908496,
 'eval_f1': 0.7777777777777777,
 'eval_accuracy': 0.7915309446254072,
 'eval_runtime': 7.4576,
 'eval_samples_per_second': 41.166,
 'eval_steps_per_second': 5.23}

In [45]:
probs_pep_test_best_model = softmax(torch.tensor(preds_pep_best_model.predictions)).tolist()

y_pred_pep_test_best_model = np.argmax(preds_pep_best_model.predictions, axis=-1).tolist()

metrics_pep_test_best_model = preds_pep_best_model.metrics
metrics_pep_test_best_model['test_loss'] = scores_pep_best_model['eval_loss']
metrics_pep_test_best_model['test_runtime'] = scores_pep_best_model['eval_runtime']
metrics_pep_test_best_model['test_samples_per_second'] = scores_pep_best_model['eval_samples_per_second']
metrics_pep_test_best_model['test_steps_per_second'] = scores_pep_best_model['eval_steps_per_second']

with open('./evaluation_outputs/baseline_ft_3/preds_pep_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test_best_model, 
               'y_pred': y_pred_pep_test_best_model, 
               'y_true': preds_pep_best_model.label_ids.tolist(), 
               'metrics': metrics_pep_test_best_model,
              }, file)