In [37]:
import numpy as np

import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import GlueDataset, default_data_collator

In [3]:
# Model directories for the small and big variants; one of them is handed to
# `from_pretrained` when the model and tokenizer are loaded.
MODEL_PATH_SMALL, MODEL_PATH_BIG = (
    '/home/nlp/experiments/big_small/small/',
    '/home/nlp/experiments/big_small/big/',
)

In [75]:
# Both the tokenizer and the sequence-classification model are loaded from
# MODEL_PATH_BIG.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH_BIG,
)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH_BIG,
)

In [7]:
# GLUE data arguments: task MNLI, with the data directory given below.
data_args = DataTrainingArguments(
    task_name='MNLI',
    data_dir='/home/nlp/data/glue_data/MNLI',
)

In [9]:
# Build the dataset in "dev" mode from the data arguments and the tokenizer.
eval_dataset = GlueDataset(
    data_args,
    tokenizer,
    mode="dev",
)

In [69]:
# Loader over the evaluation dataset: batch size 128, collated with the
# transformers default collator.
eval_dl = DataLoader(
    eval_dataset,
    batch_size=128,
    collate_fn=default_data_collator,
)

In [70]:
# Take just the first batch produced by the loader.
eval_batches = iter(eval_dl)
batch = next(eval_batches)

In [76]:
def put_on_cuda(model, batch, device="cuda"):
    """Move a model and a batch of tensors onto a device (CUDA by default).

    Unlike an in-place update, the caller's ``batch`` mapping is left
    untouched: a new dict is built and returned, so re-running the calling
    cell or reusing the original batch elsewhere is safe.

    Args:
        model: Object exposing ``.to(device)`` (e.g. a ``torch.nn.Module``).
        batch: Mapping of names to values. ``torch.Tensor`` values are moved
            to ``device``; any other value is passed through unchanged.
        device: Target device, anything accepted by ``.to()``. Defaults to
            ``"cuda"``, which matches the previous hard-coded ``.cuda()``
            behaviour.

    Returns:
        Tuple ``(model, moved_batch)`` where ``model`` is the result of
        ``model.to(device)`` and ``moved_batch`` is a new dict with the same
        keys as ``batch``.
    """
    moved_batch = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in batch.items()
    }
    return model.to(device), moved_batch

In [77]:
# Rebind both names to the model and batch returned by the helper.
model, batch = put_on_cuda(model=model, batch=batch)

In [80]:
# Switch the model to evaluation mode, then run a single forward pass on the
# sampled batch without tracking gradients.
model.eval()
with torch.no_grad():
    outputs = model(**batch)

# Element 1 of the model output is used as the logits. The batch contains a
# 'labels' entry that is passed to the model, so element 0 is presumably the
# loss - TODO confirm against the transformers version in use.
logits = outputs[1].detach().cpu().numpy()
label_ids = batch['labels'].cpu().numpy()

# Predicted class = index of the largest logit in each row.
preds = logits.argmax(axis=1)

In [114]:
# Number of examples in the evaluation dataset.
len(eval_dataset)

9815

In [74]:
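# Indices in the first eval batch where the prediction differs from the label
# (the output of np.where(label_ids != preds)), recorded for each model.
#
# small: (array([ 10,  14,  15,  20,  24,  29,  31,  32,  38,  41,  47,  52,  59,
#         61,  65,  69,  71,  73,  75,  79,  80,  87,  97, 104, 110, 122]),)
#
# big: (array([ 15,  23,  25,  38,  41,  47,  56,  59,  71,  73,  75,  79,  80,
#        104, 110]),)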

(array([ 10,  14,  15,  20,  24,  29,  31,  32,  38,  41,  47,  52,  59,
         61,  65,  69,  71,  73,  75,  79,  80,  87,  97, 104, 110, 122]),)

In [82]:
# Positions in the batch where the predicted class differs from the label.
np.nonzero(label_ids != preds)

(array([ 15,  23,  25,  38,  41,  47,  56,  59,  71,  73,  75,  79,  80,
        104, 110]),)

In [112]:
# Display the labels of the sampled batch (the NumPy array built in the inference cell).
label_ids

array([2, 0, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 2, 1, 1, 1, 2, 0, 2, 1,
       2, 2, 2, 0, 2, 0, 2, 2, 1, 1, 0, 1, 1, 2, 2, 1, 1, 0, 2, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 2, 0, 1, 2, 0, 2, 1, 0, 0, 2, 2, 1, 2, 0, 0, 2,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 2, 1, 2, 0, 0, 1, 1, 2, 0, 2, 0, 1, 2,
       1, 1, 2, 2, 0, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 1, 2, 0, 1])

In [113]:
# Persist the sampled batch for later reuse. The batch tensors were moved to
# the GPU earlier in the notebook, so copy them to the CPU first: a file that
# holds CUDA tensors cannot be loaded on a CPU-only machine unless the reader
# passes `map_location` to `torch.load`. `.cpu()` is a no-op for tensors that
# are already on the CPU.
cpu_batch = {name: tensor.cpu() for name, tensor in batch.items()}
torch.save({'samples': cpu_batch}, '/home/nlp/samples.pth')

In [111]:
# Decode one example of the batch back to text. Index 122 is listed among the
# "small" mismatches in the notes cell but not among the "big" ones.
sample_index = 122
token_ids = batch['input_ids'][sample_index].tolist()
tokenizer.decode(token_ids)

'[CLS] candidates must submit a set of fingerprints for review by the fbi. [SEP] people that want the job have to have their fingerprints sent to the fbi. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'