# Zero-shot inference on augmented and preprocessed Pap and Pep test sets.

In [37]:
import json
import random
import numpy as np
import torch
# from torch.utils.data import DataLoader
from datasets import load_from_disk
from transformers import pipeline
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaConfig
import evaluate
from evaluate import evaluator
from tqdm import tqdm

In [2]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

In [3]:
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'
DATA_PATH = PATH + '2-baselines/dataset_construction/output/'

In [4]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
device_name = torch.cuda.get_device_name()
print(f"Using device: {device} ({device_name})")

Using device: cuda:0 (NVIDIA RTX A6000)


In [5]:
# Load Pap and Pep test sets.
dataset = load_from_disk(DATA_PATH + 'testsets_baseline_raw_text/')
dataset = dataset.remove_columns('id')
dataset = dataset.with_format('torch')
dataset

DatasetDict({
    pap: Dataset({
        features: ['text', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['text', 'labels'],
        num_rows: 307
    })
})

In [6]:
pap_test = dataset['pap']
pap_test

Dataset({
    features: ['text', 'labels'],
    num_rows: 308
})

In [7]:
pep_test = dataset['pep']
pep_test

Dataset({
    features: ['text', 'labels'],
    num_rows: 307
})

In [8]:
model_name = 'roberta-large'
# model_name = 'roberta-large-mnli'

In [9]:
# Load tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

config_roberta_large = RobertaConfig.from_pretrained(
    model_name,
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
)

config_roberta_large_mnli = RobertaConfig.from_pretrained(
    model_name, 
    num_labels=2, 
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
    out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True),
)

if model_name == 'roberta-large':
    model_config = config_roberta_large
    print("Model config: roberta-large")

if model_name == 'roberta-large-mnli':
    model_config = config_roberta_large_mnli
    print("Model config: roberta-large-mnli")


model = RobertaForSequenceClassification.from_pretrained(
    model_name, 
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True
)
# model.classifier = torch.nn.Linear(model.config.hidden_size, 2)
# model.classifier.out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True)
# model.config.num_labels = 2
# model.config.id2label = {0: 'implausible', 1: 'plausible'}
# model.config.label2id = {'implausible': 0, 'plausible': 1}
model = model.to(device)
model.eval()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model config: roberta-large


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [10]:
print(model.config.num_labels)
print(model.config.id2label)
print(model.config.label2id)

2
{0: 'implausible', 1: 'plausible'}
{'implausible': 0, 'plausible': 1}


In [11]:
classifier = pipeline('zero-shot-classification', model=model, tokenizer=tokenizer, device=device)
classifier

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


<transformers.pipelines.zero_shot_classification.ZeroShotClassificationPipeline at 0x7f6f9e8cf280>

In [12]:
def predict(classifier, data, candidate_labels):
    preds = classifier(data, candidate_labels)
#     print("DATA:", data)
    y_preds = []
    y_preds_str = []
    for pred in preds:
        id_pred = np.argmax(pred['scores'])
        y_pred = pred['labels'][id_pred]
        y_pred_str = model.config.id2label[y_pred]
        y_preds.append(y_pred)
        y_preds_str.append(y_pred_str)
        
    result = {
        'y_preds': y_preds, 
        'label_preds': y_preds_str,
    }
    return result

## Run inference.

In [35]:
candidate_labels = [0, 1]

softmax = torch.nn.Softmax(dim=1)
softmax

Softmax(dim=1)

### Pap

In [14]:
data_pap = pap_test['text']
result_pap = predict(classifier, data_pap, candidate_labels)
result_pap

{'y_preds': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,

In [15]:
y_preds_pap = result_pap['y_preds']
label_preds_pap = result_pap['label_preds']

In [16]:
from collections import Counter
print(Counter(y_preds_pap))
print(Counter(label_preds_pap))

Counter({0: 230, 1: 78})
Counter({'implausible': 230, 'plausible': 78})


In [17]:
y_true_pap = pap_test['labels'].tolist()
y_true_pap

[1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [18]:
# Load evaluation metrics.
auc = evaluate.load('roc_auc')
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

In [19]:
auc_score_pap = auc.compute(references=y_true_pap, prediction_scores=y_preds_pap)
print(auc_score_pap)

{'roc_auc': 0.5324675324675324}


In [20]:
accuracy_score_pap = accuracy.compute(references=y_true_pap, predictions=y_preds_pap)
print(accuracy_score_pap)

{'accuracy': 0.5324675324675324}


In [21]:
precision_score_pap = precision.compute(references=y_true_pap, predictions=y_preds_pap)
print(precision_score_pap)

{'precision': 0.5641025641025641}


In [22]:
recall_score_pap = recall.compute(references=y_true_pap, predictions=y_preds_pap)
print(recall_score_pap)

{'recall': 0.2857142857142857}


In [23]:
f1_score_pap = f1.compute(references=y_true_pap, predictions=y_preds_pap)
f1_score_pap

{'f1': 0.3793103448275862}

In [41]:
with open('./evaluation_outputs/baseline_zeroshot_pap.json', 'w') as file:
    json.dump({'y_pred': y_preds_pap, 
               'y_true': y_true_pap, 
               'metrics':{'test_auc': auc_score_pap['roc_auc'], 
                          'test_precision': precision_score_pap['precision'], 
                          'test_recall': recall_score_pap['recall'], 
                          'test_f1': f1_score_pap['f1'], 
                          'test_accuracy': accuracy_score_pap['accuracy'],
                         }
              }, file)

### Pep

In [24]:
data_pep = pep_test['text']
result_pep = predict(classifier, data_pep, candidate_labels)
result_pep

{'y_preds': [1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,

In [25]:
y_preds_pep = result_pep['y_preds']
label_preds_pep = result_pep['label_preds']

In [26]:
print(Counter(y_preds_pep))
print(Counter(label_preds_pep))

Counter({0: 265, 1: 42})
Counter({'implausible': 265, 'plausible': 42})


In [27]:
y_true_pep = pep_test['labels'].tolist()
y_true_pep

[1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,


In [28]:
auc_score_pep = auc.compute(references=y_true_pep, prediction_scores=y_preds_pep)
print(auc_score_pep)

{'roc_auc': 0.500445632798574}


In [29]:
accuracy_score_pep = accuracy.compute(references=y_true_pep, predictions=y_preds_pep)
print(accuracy_score_pep)

{'accuracy': 0.501628664495114}


In [30]:
precision_score_pep = precision.compute(references=y_true_pep, predictions=y_preds_pep)
print(precision_score_pep)

{'precision': 0.5}


In [31]:
recall_score_pep = recall.compute(references=y_true_pep, predictions=y_preds_pep)
print(recall_score_pep)

{'recall': 0.13725490196078433}


In [32]:
f1_score_pep = f1.compute(references=y_true_pep, predictions=y_preds_pep)
f1_score_pep

{'f1': 0.21538461538461542}

In [40]:
with open('./evaluation_outputs/baseline_zeroshot_pep.json', 'w') as file:
    json.dump({'y_pred': y_preds_pep, 
               'y_true': y_true_pep, 
               'metrics':{'test_auc': auc_score_pep['roc_auc'], 
                          'test_precision': precision_score_pep['precision'], 
                          'test_recall': recall_score_pep['recall'], 
                          'test_f1': f1_score_pep['f1'], 
                          'test_accuracy': accuracy_score_pep['accuracy'],
                         }
              }, file)