In [1]:
%load_ext autoreload
%autoreload 2

In [17]:
import dataclasses
import logging
import os
import sys
sys.path.append("..")
from dataclasses import dataclass, field
from typing import Callable, Dict, Optional

import numpy as np
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, EvalPrediction)
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (HfArgumentParser, Trainer, TrainingArguments,
                          glue_compute_metrics, glue_output_modes,
                          glue_tasks_num_labels, set_seed)

from datasets.hans_dataset import HansDataset
from datasets.hans_processors import glue_output_modes, glue_tasks_num_labels

In [4]:
logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
    """
    Identifies the pretrained checkpoint to fine-tune from, together with optional
    overrides for its config, its tokenizer and the download cache directory.

    Only ``model_name_or_path`` is mandatory; every other field defaults to ``None``.
    """

    # Mandatory: declared without a default, so it must be passed at construction.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    # Optional override for the config to load.
    config_name: Optional[str] = field(
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
        default=None,
    )
    # Optional override for the tokenizer to load.
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )
    # Optional directory handed to the loaders as ``cache_dir``.
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
        default=None,
    )

In [53]:
# Build the three argument objects inline with hard-coded values.
# NOTE(review): the data and output paths are absolute and machine-specific.
model_args = ModelArguments(model_name_or_path = 'albert-base-v2')
data_args = DataTrainingArguments(task_name = 'hans', data_dir = '/home/nlp/data/hans/')
training_args = TrainingArguments(output_dir = '/home/nlp/experiments/trial',
                                 do_eval = True)


# Refuse to train into an existing, non-empty output directory unless overwriting
# was requested. The `and` chain short-circuits, so os.listdir only runs when the
# directory exists. With the arguments above the logged TrainingArguments show
# do_train=False, so this guard does not fire in this notebook run.
if (
    os.path.exists(training_args.output_dir)
    and os.listdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Setup logging: INFO when local_rank is -1 or 0, WARN for any other rank.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
# Emitted at WARNING level, so it passes either of the two thresholds chosen above.
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Set seed (the logged TrainingArguments show seed=42)
set_seed(training_args.seed)

# Look up the label count and output mode for the task; an unknown task name is
# reported as a ValueError. Both lookup tables are imported twice in the import
# cell (from transformers, then from datasets.hans_processors); the later import
# is the one bound here.
try:
    num_labels = glue_tasks_num_labels[data_args.task_name]
    output_mode = glue_output_modes[data_args.task_name]
except KeyError:
    raise ValueError("Task not found: %s" % (data_args.task_name))

# Load pretrained model and tokenizer (performed in later cells of this notebook)
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

06/05/2020 13:45:43 - INFO - transformers.training_args -   PyTorch: setting up devices
06/05/2020 13:45:43 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/home/nlp/experiments/trial', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False)


In [7]:
training_args.seed

42

In [54]:
# An explicit config name wins; a missing or empty one falls back to the model path.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    cache_dir=model_args.cache_dir,
)

# Same fallback rule for the tokenizer name.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

06/05/2020 13:45:45 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json from cache at /home/nlp/.cache/torch/transformers/0bbb1531ce82f042a813219ffeed7a1fa1f44cd8f78a652c47fc5311e0d40231.978ff53dd976bbf4bc66f09bf4205da0542be753d025263787842df74d15bbca
06/05/2020 13:45:45 - INFO - transformers.configuration_utils -   Model config AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "finetuning_task": "hans",
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
  

In [15]:
# Only build the evaluation split when evaluation was requested.
if training_args.do_eval:
    eval_dataset = HansDataset(data_args, tokenizer=tokenizer, evaluate=True)
else:
    eval_dataset = None

06/05/2020 12:10:33 - INFO - filelock -   Lock 140635937441536 acquired on /home/nlp/data/hans/cached_dev_AlbertTokenizer_128_hans.lock
06/05/2020 12:10:33 - INFO - datasets.hans_dataset -   Creating features from dataset file at /home/nlp/data/hans/
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   Writing example 0
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   *** Example ***
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   text_a: The president advised the doctor .
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   text_b: The doctor advised the president .
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   guid: dev-non-entailment
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   input_ids: 2 14 406 10017 14 1687 13 9 3 14 1687 10017 14 406 13 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

06/05/2020 12:10:33 - INFO - datasets.hans_processors -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   label: temp1 (id = 0)
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   *** Example ***
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   text_a: The managers saw the secretaries .
06/05/2020 12:10:33 - INFO - datasets.hans_processors -   text_b: The secretaries saw

In [18]:
def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
    """
    Build the ``compute_metrics`` callback for one task.

    Args:
        task_name: Key used to look up the task's output mode in
            ``glue_output_modes``; also forwarded to ``glue_compute_metrics``.

    Returns:
        A function mapping an ``EvalPrediction`` to whatever
        ``glue_compute_metrics`` returns for ``task_name``.

    Raises:
        ValueError: If ``task_name`` has no entry in ``glue_output_modes``, or
            if its output mode is neither "classification" nor "regression".
    """
    # Resolve the output mode from the argument itself, so the returned callback
    # does not depend on whichever `output_mode` / `data_args` globals happen to
    # be live in the kernel when it is eventually called.
    try:
        task_output_mode = glue_output_modes[task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (task_name))

    # Fail here with a clear message instead of hitting an unbound `preds`
    # inside the callback at evaluation time.
    if task_output_mode not in ("classification", "regression"):
        raise ValueError("Unsupported output mode: %s" % (task_output_mode))

    def compute_metrics_fn(p: EvalPrediction) -> Dict:
        if task_output_mode == "classification":
            # Index of the largest score along axis 1
            # (assumes predictions are laid out as (examples, classes) — TODO confirm).
            preds = np.argmax(p.predictions, axis=1)
        else:
            # Regression: drop length-1 axes from the predictions.
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(task_name, preds, p.label_ids)

    return compute_metrics_fn

In [19]:
# Evaluation-only trainer: no training set is supplied.
# `model` is not created in this cell; it comes from the
# AutoModelForSequenceClassification.from_pretrained cell of this notebook,
# which has to be executed before this one.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=None,
    eval_dataset=eval_dataset,
    compute_metrics=build_compute_metrics_fn(data_args.task_name),
)

06/05/2020 12:11:39 - INFO - transformers.trainer -   Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


06/05/2020 12:11:40 - INFO - wandb.run_manager -   system metrics and metadata threads started
06/05/2020 12:11:40 - INFO - wandb.run_manager -   checking resume status, waiting at most 10 seconds
06/05/2020 12:11:41 - INFO - wandb.run_manager -   resuming run from id: UnVuOnYxOjE2YXMyMGZvOmh1Z2dpbmdmYWNlOmNhbHZpbg==
06/05/2020 12:11:41 - INFO - wandb.run_manager -   file/dir created: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/wandb-metadata.json
06/05/2020 12:11:41 - INFO - wandb.run_manager -   upserting run before process can begin, waiting at most 10 seconds
06/05/2020 12:11:41 - INFO - wandb.run_manager -   saving pip packages
06/05/2020 12:11:41 - INFO - wandb.run_manager -   initializing streaming files api
06/05/2020 12:11:41 - INFO - wandb.run_manager -   unblocking file change observer, beginning sync with W&B servers
06/05/2020 12:11:41 - INFO - wandb.run_manager -   file/dir created: /home/nlp/transformers-importance-sampling/nbs/wandb

In [21]:
result = trainer.evaluate(eval_dataset=eval_dataset)

06/05/2020 12:14:04 - INFO - transformers.trainer -   ***** Running Evaluation *****
06/05/2020 12:14:04 - INFO - transformers.trainer -     Num examples = 30000
06/05/2020 12:14:04 - INFO - transformers.trainer -     Batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1875.0, style=ProgressStyle(description_…






06/05/2020 12:15:00 - INFO - wandb.run_manager -   system metrics and metadata threads started
06/05/2020 12:15:00 - INFO - wandb.run_manager -   checking resume status, waiting at most 10 seconds
06/05/2020 12:15:00 - INFO - wandb.run_manager -   resuming run from id: UnVuOnYxOjE2YXMyMGZvOmh1Z2dpbmdmYWNlOmNhbHZpbg==
06/05/2020 12:15:01 - INFO - wandb.run_manager -   upserting run before process can begin, waiting at most 10 seconds
06/05/2020 12:15:01 - INFO - wandb.run_manager -   file/dir modified: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/config.yaml
06/05/2020 12:15:01 - INFO - wandb.run_manager -   saving pip packages
06/05/2020 12:15:01 - INFO - wandb.run_manager -   initializing streaming files api
06/05/2020 12:15:01 - INFO - wandb.run_manager -   unblocking file change observer, beginning sync with W&B servers
06/05/2020 12:15:01 - INFO - wandb.run_manager -   shutting down system stats and metadata service
06/05/2020 12:15:02 - INFO - 

{"eval_loss": 1.1784355945587157, "eval_acc": 0.04203333333333333, "step": null}


06/05/2020 12:15:02 - INFO - wandb.run_manager -   file/dir modified: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/requirements.txt
06/05/2020 12:15:02 - INFO - wandb.run_manager -   file/dir modified: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/wandb-events.jsonl
06/05/2020 12:15:02 - INFO - wandb.run_manager -   file/dir modified: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/wandb-summary.json
06/05/2020 12:15:02 - INFO - wandb.run_manager -   file/dir modified: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/wandb-history.jsonl
06/05/2020 12:15:02 - INFO - wandb.run_manager -   stopping streaming files and file change observer
06/05/2020 12:15:03 - INFO - wandb.run_manager -   file/dir modified: /home/nlp/transformers-importance-sampling/nbs/wandb/run-20200605_031139-16as20fo/wandb-metadata.json


In [22]:
result

{'eval_loss': 1.1784355945587157, 'eval_acc': 0.04203333333333333}

In [64]:
# Override the checkpoint's `inner_group_num` (printed as 1 in the config log above) with 6.
# The model-load log that follows this change reports `albert_layers.1.*` weights as
# "not initialized from pretrained model", so results from a model built with this
# config are not those of the plain pretrained checkpoint.
config.inner_group_num = 6

In [65]:
# Load the sequence-classification model from the pretrained checkpoint, using
# `config` as it stands at this point (including any in-place overrides made to it).
# NOTE: the Trainer cell earlier in this notebook reads `model`, so this cell has to
# be executed before that one on a fresh kernel.
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

06/05/2020 15:15:46 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/albert-base-v2-pytorch_model.bin from cache at /home/nlp/.cache/torch/transformers/c7c1b2b621933bfa9a5f6ed18b1d6dc2f445001779b13d37286a806117ebeb10.ab806923413c2af99835e13fdbb6014b24af86b0de8edc2d71ef5c646fc54f24
06/05/2020 15:15:47 - INFO - transformers.modeling_utils -   Weights of AlbertForSequenceClassification not initialized from pretrained model: ['albert.encoder.albert_layer_groups.0.albert_layers.1.full_layer_layer_norm.weight', 'albert.encoder.albert_layer_groups.0.albert_layers.1.full_layer_layer_norm.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.1.attention.query.weight', 'albert.encoder.albert_layer_groups.0.albert_layers.1.attention.query.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.1.attention.key.weight', 'albert.encoder.albert_layer_groups.0.albert_layers.1.attention.key.bias', 'albert.encoder.albert_layer_groups.0.albert_layers.1.atte

In [66]:
model.albert.encoder.albert_layer_groups

ModuleList(
  (0): AlbertLayerGroup(
    (albert_layers): ModuleList(
      (0): AlbertLayer(
        (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (attention): AlbertAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0, inplace=False)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
        (ffn): Linear(in_features=768, out_features=3072, bias=True)
        (ffn_output): Linear(in_features=3072, out_features=768, bias=True)
      )
      (1): AlbertLayer(
        (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (attention): AlbertAttention(
          (query): Linear(in_features=768, out_features=7

In [9]:
from transformers.data.processors import glue_processors

In [10]:
processor = glue_processors[data_args.task_name]()

In [11]:
processor.get_dev_examples(data_args.data_dir)

FileNotFoundError: [Errno 2] No such file or directory: '/home/nlp/data/hans/dev_matched.tsv'

In [34]:
# Open the HANS evaluation file for ad-hoc inspection by the following cells.
# NOTE(review): no matching f.close() is visible in this notebook; the handle stays
# open for the life of the kernel — consider reading inside a `with` block instead.
f = open("/home/nlp/data/hans/heuristics_evaluation_set.txt", "r")

In [33]:
for x in f:
    print(x)

In [49]:
f.readline()

'non-entailment\t( ( The senators ) ( ( contacted ( the scientists ) ) . ) )\t( ( The scientists ) ( ( contacted ( the senators ) ) . ) )\t(ROOT (S (NP (DT The) (NNS senators)) (VP (VBD contacted) (NP (DT the) (NNS scientists))) (. .)))\t(ROOT (S (NP (DT The) (NNS scientists)) (VP (VBD contacted) (NP (DT the) (NNS senators))) (. .)))\tThe senators contacted the scientists .\tThe scientists contacted the senators .\tex12\tlexical_overlap\tln_subject/object_swap\ttemp1\n'

In [61]:
# NOTE(review): the import cell at the top of this notebook takes its HANS helpers from
# `datasets.hans_processors` / `datasets.hans_dataset`, while this cell imports from
# `hans.hans_processors` — presumably an older module path; confirm it still exists.
from hans.hans_processors import HansProcessor, hans_convert_examples_to_features

In [55]:
hans_proc = HansProcessor()

In [60]:
examples = hans_proc.get_dev_examples("/home/nlp/data/hans/")

In [62]:
label_list = hans_proc.get_labels()

In [59]:
hans_proc.get_labels()

['contradiction', 'entailment', 'neutral']

In [63]:
output_mode = glue_output_modes[data_args.task_name]

In [65]:
features = hans_convert_examples_to_features(examples, tokenizer, max_length = data_args.max_seq_length,
                                            label_list = label_list, output_mode = output_mode)

05/20/2020 22:34:59 - INFO - hans.hans_processors -   Writing example 0
05/20/2020 22:34:59 - INFO - hans.hans_processors -   *** Example ***
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_a: The president advised the doctor .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_b: The doctor advised the president .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   guid: dev-non-entailment
05/20/2020 22:34:59 - INFO - hans.hans_processors -   input_ids: 2 14 406 10017 14 1687 13 9 3 14 1687 10017 14 406 13 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/20/2020 22:34:59 - INFO - hans.hans_processors -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

05/20/2020 22:34:59 - INFO - hans.hans_processors -   label: temp1 (id = 0)
05/20/2020 22:34:59 - INFO - hans.hans_processors -   *** Example ***
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_a: The managers saw the secretaries .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   text_b: The secretaries saw the managers .
05/20/2020 22:34:59 - INFO - hans.hans_processors -   guid: dev-non-entailment
05/20/2020 22:34:59 - INFO - hans.hans_processors -   input_ids: 2 14 12657 441 14 25738 13 9 3 14 25738 441 14 12657 13 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/20/2020 22:34:59 - INFO - hans.hans_processors -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [66]:
class HansDataset(Dataset):
    """
    Dataset of HANS input features, built once and cached on disk.

    On construction the features are either loaded from a cache file located in
    ``args.data_dir`` or created through ``HansProcessor`` and
    ``hans_convert_examples_to_features`` and then written to that cache file.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    args: GlueDataTrainingArguments
    output_mode: str
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        evaluate=False,
    ):
        """
        Load the features from the cache file, or build them and write the cache.

        Args:
            args: Data arguments; ``task_name``, ``data_dir``, ``max_seq_length``
                and ``overwrite_cache`` are read.
            tokenizer: Tokenizer handed to ``hans_convert_examples_to_features``;
                its class name is part of the cache file name.
            limit_length: If given, only the first ``limit_length`` examples are
                converted. Has no effect when the features come from the cache.
            evaluate: Use the dev examples (and the "dev" cache file) when true,
                otherwise the train examples (and the "train" cache file).
        """
        self.args = args
        processor = HansProcessor()
        self.output_mode = glue_output_modes[args.task_name]
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}_{}".format(
                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                # NOTE(review): torch.load unpickles the file; only point data_dir at
                # cache files that were produced by this code.
                self.features = torch.load(cached_features_file)
                # The path is passed as a %s argument (not interpolated into the format
                # string) so a literal '%' in the path cannot break the log formatting.
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")
                label_list = processor.get_labels()
                if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                    RobertaTokenizer,
                    RobertaTokenizerFast,
                    XLMRobertaTokenizer,
                ):
                    # HACK(label indices are swapped in RoBERTa pretrained model)
                    label_list[1], label_list[2] = label_list[2], label_list[1]
                examples = (
                    processor.get_dev_examples(args.data_dir)
                    if evaluate
                    else processor.get_train_examples(args.data_dir)
                )
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = hans_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                # NOTE(review): limit_length is not part of the cache file name, so a
                # truncated feature list saved here is what later cache hits will load.
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        """Return the number of cached/converted features."""
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        """Return the feature stored at index ``i``."""
        return self.features[i]

{
  "attention_mask": [
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0
  ],
  "input_ids": [
    2,
    14,
    406,
    10017,
    14,
    1687,
    13

In [None]:
training_args.overwrite_output_dir

In [11]:
data_args.ou

device(type='cuda')