In [None]:
# init model and dataset for testing and analysis
# code taken from run.py

import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser
from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy
import os
import json
# Worker-process count passed as `num_proc` to the dataset `.map` calls below.
NUM_PREPROCESSING_WORKERS = 1

# Same parser as run.py: the TrainingArguments fields plus the script-specific
# options registered from the table below.
argp = HfArgumentParser(TrainingArguments)

# One (flag, add_argument keyword arguments) entry per script-specific option,
# registered in the order listed.
_EXTRA_CLI_OPTIONS = [
    ('--model', {
        'type': str,
        'default': 'google/electra-small-discriminator',
        'help': """This argument specifies the base model to fine-tune.
    This should either be a HuggingFace model ID (see https://huggingface.co/models)
    or a path to a saved model checkpoint (a folder containing config.json and pytorch_model.bin).""",
    }),
    ('--task', {
        'type': str,
        'choices': ['nli', 'qa'],
        'required': True,
        'help': """This argument specifies which task to train/evaluate on.
    Pass "nli" for natural language inference or "qa" for question answering.
    By default, "nli" will use the SNLI dataset, and "qa" will use the SQuAD dataset.""",
    }),
    ('--dataset', {
        'type': str,
        'default': None,
        'help': """This argument overrides the default dataset used for the specified task.""",
    }),
    ('--max_length', {
        'type': int,
        'default': 128,
        'help': """This argument limits the maximum sequence length used during training/evaluation.
    Shorter sequence lengths need less memory and computation time, but some examples may end up getting truncated.""",
    }),
    ('--max_train_samples', {
        'type': int,
        'default': None,
        'help': 'Limit the number of examples to train on.',
    }),
    ('--max_eval_samples', {
        'type': int,
        'default': None,
        'help': 'Limit the number of examples to evaluate on.',
    }),
]
for _flag, _options in _EXTRA_CLI_OPTIONS:
    argp.add_argument(_flag, **_options)

# Notebook stand-ins for the options run.py would normally read from the
# command line: an evaluation-only configuration for the QA task.
args_task = 'qa'
args_model = './trained_model/'
args_dataset = 'eval_output_colab/eval_predictions.jsonl'
args_max_length = 128

# No cap on the number of examples used from either split.
args_max_train_samples = args_max_eval_samples = None

# Evaluate only; skip training.
training_args_do_train, training_args_do_eval = False, True

# Argument parsing is skipped: this line was throwing an error when executed in this notebook.
# training_args, args = argp.parse_args_into_dataclasses()

# Dataset selection
# `args_dataset` may be None (meaning "use the task's default dataset"), so the
# file-extension check must rule that out before calling a str method on it.
if args_dataset is not None and args_dataset.endswith(('.json', '.jsonl')):
    dataset_id = None
    # Load from local json/jsonl file
    dataset = datasets.load_dataset('json', data_files=args_dataset)
    # By default, the "json" dataset loader places all examples in the train split,
    # so if we want to use a jsonl file for evaluation we need to get the "train" split
    # from the loaded dataset
    eval_split = 'train'
else:
    default_datasets = {'qa': ('squad',), 'nli': ('snli',)}
    # An explicit dataset is given as "name" or "name:config"; splitting on ':'
    # yields the positional arguments for load_dataset.
    dataset_id = tuple(args_dataset.split(':')) if args_dataset is not None else \
        default_datasets[args_task]
    # MNLI has two validation splits (one with matched domains and one with mismatched domains). Most datasets just have one "validation" split
    eval_split = 'validation_matched' if dataset_id == ('glue', 'mnli') else 'validation'
    # Load the raw data
    dataset = datasets.load_dataset(*dataset_id)

# Only the NLI classification head takes an explicit label count
# (label 0 is "entailed", 1 is "neutral", and 2 is "contradiction").
task_kwargs = {}
if args_task == 'nli':
    task_kwargs['num_labels'] = 3

# Map each supported task to the Auto* class that provides its fine-tuning head.
model_classes = {
    'qa': AutoModelForQuestionAnswering,
    'nli': AutoModelForSequenceClassification,
}
model_class = model_classes[args_task]

# Load the model and its tokenizer from the same pretrained model/checkpoint.
model = model_class.from_pretrained(args_model, **task_kwargs)
tokenizer = AutoTokenizer.from_pretrained(args_model, use_fast=True)

# Choose the per-task preprocessing callables (the underlying functions are
# defined in helpers.py). Unknown tasks are rejected up front.
if args_task not in ('qa', 'nli'):
    raise ValueError('Unrecognized task name: {}'.format(args_task))

if args_task == 'qa':
    def prepare_train_dataset(exs):
        """Featurize a batch of QA training examples with the notebook's tokenizer."""
        return prepare_train_dataset_qa(exs, tokenizer)

    def prepare_eval_dataset(exs):
        """Featurize a batch of QA validation examples with the notebook's tokenizer."""
        return prepare_validation_dataset_qa(exs, tokenizer)
else:
    def prepare_train_dataset(exs):
        """Featurize a batch of NLI examples, passing args_max_length through."""
        return prepare_dataset_nli(exs, tokenizer, args_max_length)

    # NLI uses the same preprocessing for training and evaluation.
    prepare_eval_dataset = prepare_train_dataset

print("Preprocessing data... (this takes a little bit, should only happen once per dataset)")
if dataset_id == ('snli',):
    # remove SNLI examples with no label
    dataset = dataset.filter(lambda ex: ex['label'] != -1)


def _select_and_featurize(split, max_samples, prepare_fn):
    """Optionally truncate a dataset split, then featurize it.

    Args:
        split: the dataset split to process.
        max_samples: keep only the first `max_samples` examples; a falsy value
            (None or 0) keeps the whole split.
        prepare_fn: batched preprocessing function handed to `split.map`.

    Returns:
        A `(split, featurized)` pair: the (possibly truncated) raw split and
        the result of mapping `prepare_fn` over it with the original columns
        removed.
    """
    if max_samples:
        # Clamp to the split size so a cap larger than the split does not
        # request out-of-range indices.
        split = split.select(range(min(max_samples, len(split))))
    featurized = split.map(
        prepare_fn,
        batched=True,
        num_proc=NUM_PREPROCESSING_WORKERS,
        remove_columns=split.column_names
    )
    return split, featurized


# All four names stay None for a phase that is switched off.
train_dataset = None
eval_dataset = None
train_dataset_featurized = None
eval_dataset_featurized = None
if training_args_do_train:
    train_dataset, train_dataset_featurized = _select_and_featurize(
        dataset['train'], args_max_train_samples, prepare_train_dataset)
if training_args_do_eval:
    eval_dataset, eval_dataset_featurized = _select_and_featurize(
        dataset[eval_split], args_max_eval_samples, prepare_eval_dataset)


In [None]:
# generate some stats from eval_output_original

import jsonlines

# Maps a question type (the lower-cased first word of the question) to its
# counters: predictions seen, exact matches against a reference answer, misses.
stats = {}
with jsonlines.open(args_dataset) as f:
    for line in f.iter():
        q_type = line['question'].split(" ")[0].lower()
        correct = line['predicted_answer'] in line['answers']['text']
        # setdefault creates the counters the first time a question type is
        # seen, so no exception handling is needed to detect new types and
        # genuine errors are no longer swallowed.
        counts = stats.setdefault(q_type, {'total': 0, 'right': 0, 'wrong': 0})
        counts['total'] += 1
        counts['right' if correct else 'wrong'] += 1

# Every entry has total >= 1, so the division is always defined.
for counts in stats.values():
    counts['success_rate'] = counts['right'] / counts['total']

# Each output line is a two-element [question_type, counters] record.
with jsonlines.open('stats.jsonl', mode='w') as writer:
    for q_type, counts in stats.items():
        writer.write((q_type, counts))

# Last expression of the cell, so the notebook displays the table.
stats


In [None]:
# Collect every example whose predicted answer is not among the reference
# answers, and compute the overall exact-match score along the way.

wrong_preds = []
total = 0
correct = 0
with jsonlines.open(args_dataset) as reader:
    for record in reader.iter():
        # Show the available fields once, for the first record only.
        if not total:
            print(record.keys())
        total += 1
        if record['predicted_answer'] in record['answers']['text']:
            correct += 1
            continue
        wrong_preds.append(record)

# Fraction of examples answered with an exact match.
score = correct / total
score


In [None]:
# show some random wrong predictions

import random

# Draw up to 5 distinct examples. random.sample only ever returns existing
# items (indexing with randint(0, len(...)) can run one past the end because
# randint's upper bound is inclusive), and the min() keeps the cell working
# when fewer than 5 wrong predictions exist.
for pred in random.sample(wrong_preds, min(5, len(wrong_preds))):
    print(pred['id'], pred['context'])
    print("QUESTION:", pred['question'])
    print("ANSWERS:", pred['answers']['text'])
    print("WRONG PREDICTION:", pred['predicted_answer'], "\n")
    

In [None]:
# Dump every collected wrong prediction to its own .jsonl file, one record per line.

with jsonlines.open('wrong_preds_original.jsonl', mode='w') as writer:
    writer.write_all(wrong_preds)

In [None]:
# save 50 wrong subject predictions to a .jsonl

import random


def _looks_like_two_word_name(pred):
    """Return True for a Who/What question answered with exactly two capitalized words."""
    if pred['question'].split(" ")[0] not in ('Who', 'What'):
        return False
    words = pred['predicted_answer'].split(" ")
    # word[:1] rather than word[0]: an empty token (e.g. from a trailing
    # space) is rejected instead of raising IndexError.
    return len(words) == 2 and all(word[:1].isupper() for word in words)


samples = 50
subject_candidates = [pred for pred in wrong_preds if _looks_like_two_word_name(pred)]

with jsonlines.open('wrong_preds_subjects.jsonl', mode='w') as writer:
    # Sampling without replacement from the pre-filtered pool always
    # terminates, even with fewer than `samples` candidates, and never writes
    # the same prediction twice.
    for pred in random.sample(subject_candidates, min(samples, len(subject_candidates))):
        writer.write(pred)


In [261]:
# run model on custom data set
# Evaluation-only invocation (--do_eval) of run.py for the QA task, using the
# checkpoint in ./trained_model/ on ./custom_sets/custom01.jsonl, with
# --output_dir set to ./cust_output_01/.
# NOTE(review): the output recorded for this cell ends in a traceback raised from
# datasets' metric.add_batch (pyarrow Table.from_pydict). Presumably the custom
# file's fields do not match what the metric expects -- confirm before relying
# on anything written to ./cust_output_01/.
!python run.py --do_eval --task qa --dataset ./custom_sets/custom01.jsonl --model ./trained_model/ --output_dir ./cust_output_01/

Using custom data configuration default-66c5ccf7f0cc6c96

0 tables [00:00, ? tables/s]
                            

 #0:   0%|          | 0/1 [00:00<?, ?ba/s]
 #0: 100%|██████████| 1/1 [00:00<00:00, 35.72ba/s]


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\priet\.cache\huggingface\datasets\json\default-66c5ccf7f0cc6c96\0.0.0\45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264...
Dataset json downloaded and prepared to C:\Users\priet\.cache\huggingface\datasets\json\default-66c5ccf7f0cc6c96\0.0.0\45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264. Subsequent calls will reuse this data.
Preprocessing data... (this takes a little bit, should only happen once per dataset)




 #1:   0%|          | 0/1 [00:00<?, ?ba/s][A
 #1: 100%|██████████| 1/1 [00:00<00:00, 41.67ba/s]
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Evaluation *****
  Num examples = 12
  Batch size = 8

  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:02<00:00,  1.09s/it]

  0%|          | 0/12 [00:00<?, ?it/s][A
100%|██████████| 12/12 [00:00<00:00, 413.87it/s]
Traceback (most recent call last):
  File "C:\Users\priet\anaconda3\envs\nlp3.6\lib\site-packages\datasets\metric.py", line 435, in add_batch
    self.writer.write_batch(batch)
  File "C:\Users\priet\anaconda3\envs\nlp3.6\lib\site-packages\datasets\arrow_writer.py", line 391, in write_batch
    pa_table = pa.Table.from_pydict(typed_sequence_examples)
  File "pyarrow\table.pxi", line 1724, in pyarrow.lib.Table.from_pydict
  File "pyarrow\table.pxi", line 2369, in pyarrow.lib._fro