In [50]:
import dataclasses
import logging
import os
import sys
sys.path.append("..")
import math
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Tuple, Optional

import numpy as np
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler

from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, EvalPrediction, GlueDataset, default_data_collator) 
# from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (HfArgumentParser, Trainer, TrainingArguments,
                          glue_compute_metrics, glue_output_modes,
                          glue_tasks_num_labels, set_seed)
from transformers import PreTrainedModel
from transformers.data.data_collator import DataCollator
import pandas as pd
import higher
from torch.optim.sgd import SGD
from torch.optim.adam import Adam


from tqdm import tqdm, trange
from fluence.meta import MetaDataset

In [2]:
logger = logging.getLogger(__name__)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

In [3]:
model_args = ModelArguments(model_name_or_path = 'albert-base-v2')
data_args = DataTrainingArguments(task_name = 'MRPC', data_dir = '/home/nlp/data/glue_data/MRPC')
training_args = TrainingArguments(output_dir = '/home/nlp/experiments/meta/',
                                 do_eval = True,
                                 per_device_train_batch_size=64)


if (
    os.path.exists(training_args.output_dir)
    and os.listdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Set seed
set_seed(training_args.seed)

try:
    num_labels = glue_tasks_num_labels[data_args.task_name]
    output_mode = glue_output_modes[data_args.task_name]
except KeyError:
    raise ValueError("Task not found: %s" % (data_args.task_name))

In [81]:
config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir=model_args.cache_dir,
)


In [5]:
def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
    def compute_metrics_fn(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    return compute_metrics_fn

In [6]:
@dataclass
class Meta_Arguments(TrainingArguments):
    train_task: List = field(default = None, metadata = 'Support dataset')
    eval_task: List = field(default = None, metadata = 'Query dataset')
    data_dir: str = field(default = None)
    inner_learning_rate: float = field(default = 1e-3)
    outer_learning_rate: float = field(default = 2e-5)
    max_len: int = field(default = 80)
    eval_method: str = field(default = None)
    max_seq_length: int = field(
    default=128,
    metadata={
        "help": "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded."
    },
    )
    overwrite_cache: bool = field(
    default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

In [79]:
meta_args = Meta_Arguments(data_dir = '/home/nlp/data/glue_data/',
                           output_dir = '/home/nlp/experiments/fluence_test', 
                           train_task = 'mrpc', eval_task='sst-2',
                           eval_method = 'every_2',
                           num_train_epochs = 1,
                           per_device_train_batch_size=1,
                           per_device_eval_batch_size=512
                          )

In [31]:
meta_args.device

device(type='cuda', index=0)

In [60]:
data_dir = '/home/nlp/data/glue_data/'

In [80]:
meta_args.task_name = meta_args.train_task
meta_args.data_dir = data_dir + 'MRPC'
train_dataset = GlueDataset(meta_args, tokenizer=tokenizer)
meta_dataset = MetaDataset(train_dataset)
meta_args.task_name = meta_args.eval_task
meta_args.data_dir = data_dir + 'SST-2'
eval_dataset = GlueDataset(meta_args, tokenizer=tokenizer, mode="dev")


100%|██████████| 1194/1194 [00:00<00:00, 13820.58it/s]


In [33]:
eval_dl = DataLoader(
            eval_dataset,
            collate_fn = default_data_collator
            )

In [82]:
@dataclass
class MetaTrainer(Trainer):
    def __init__(
        self,
        model: PreTrainedModel,
        args: TrainingArguments,
        train_dataset: MetaDataset,
        eval_dataset: DataLoader,
        train_data_collator: Optional[DataCollator] = None,
        eval_data_collator: Optional[DataCollator] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        prediction_loss_only=False,
        optimizers: Tuple[
            torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR
        ] = None,
    ):

        self.model = model.to(args.device)
        self.args = args
        self.compute_metrics = compute_metrics
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.train_data_collator = train_data_collator
        self.eval_data_collator = eval_data_collator if not None else default_data_collator
        self.prediction_loss_only = prediction_loss_only
        self.optimizers = optimizers
        self.eval_results = {}
        self._setup_wandb()
        set_seed(self.args.seed)

    def get_loss_mean(self, loss):
        return loss.mean() if self.args.n_gpu > 1 else loss

    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        train_sampler = (
            RandomSampler(self.train_dataset)
            if self.args.local_rank == -1
            else DistributedSampler(self.train_dataset)
        )

        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.train_data_collator,
            drop_last=self.args.dataloader_drop_last,
        )

        return data_loader
    
    def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

        if self.args.local_rank != -1:
            sampler = SequentialDistributedSampler(eval_dataset)
        else:
            sampler = SequentialSampler(eval_dataset)

        data_loader = DataLoader(
            eval_dataset,
            sampler=sampler,
            batch_size=self.args.eval_batch_size,
            collate_fn=self.eval_data_collator,
            drop_last=self.args.dataloader_drop_last,
        )

        return data_loader
    
    def put_on_device(self, inputs):
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)
        return inputs
    
    def train(self):
        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader(self.eval_dataset)
        columns = [self.args.train_task, self.args.eval_task]
        metrics = [
            "eval_loss",
            "eval_acc",
            "eval_f1",
            "eval_acc_and_f1",
            "eval_mnli-mm/acc",
        ]
        df = pd.DataFrame(columns=columns, index=metrics)
        for i in range(len(df.columns)):
            for j in range(len(metrics)):
                df[columns[i]][metrics[j]] = []

        model = self.model
        optimizer, scheduler = self.get_optimizers(
            int(
                len(train_dataloader)
                // self.args.gradient_accumulation_steps
                * self.args.num_train_epochs
            )
        )

        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        if self.args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device=[self.args.local_rank],
                output_device=self.args.local_rank,
                find_unused_parameters=True,
            )
        # TODO: Make calculation of num_epochs with HF
        num_train_epochs = self.args.num_train_epochs
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )

        logger.info("***** Running training *****")
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info(
            "  Instantaneous batch size per device = %d",
            self.args.per_device_train_batch_size,
        )
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            total_train_batch_size,
        )
        logger.info(
            "  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps
        )

        model.zero_grad()
        self.global_step = 0

        if self.args.eval_method == 'every_2':
            eval_step = [2 ** i for i in range(1, 20)]
        
        inner_optimizer = torch.optim.SGD(
            model.parameters(), lr=self.args.learning_rate
        )
        model.train()

        tqdm_iterator = tqdm(train_dataloader, desc="Batch Index")
        
        for epoch in tqdm(range(int(self.args.num_train_epochs))):
            for batch_idx, meta_batch in enumerate(tqdm_iterator):
                model.zero_grad()
                target_batch = next(iter(eval_dataloader))
                outer_loss = 0.0
                for inputs, target_inputs in zip(meta_batch, target_batch):

                    inputs = self.put_on_device(inputs)
                    target_inputs = self.put_on_device(inputs)

                    with higher.innerloop_ctx(
                        model, inner_optimizer, copy_initial_weights=False
                    ) as (fmodel, diffopt):

                        inner_loss = model(**inputs)[0]
                        inner_loss = self.get_loss_mean(inner_loss)
                        diffopt.step(inner_loss)
                        outer_loss += model(**target_inputs)[0]

                self.global_step += 1
                outer_loss = self.get_loss_mean(outer_loss)
                outer_loss.backward()
                optimizer.step()

                if (batch_idx + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), self.args.max_grad_norm
                    )

                # Run evaluation on task list
                if self.global_step in eval_step:
                    result = self.evaluate(eval_dataloader.dataset)
                    for key, value in result.items():
                        logger.info(
                            "%s  %s = %s",
                            self.args.eval_task,
                            key,
                            value,
                        )
                    df[self.args.train_task][key].append(value)

                # Save model
                if (
                    self.args.save_steps > 0
                    and self.global_step % self.args.save_steps == 0
                ):
                    if hasattr(model, "module"):
                        assert model.module is self.model
                    else:
                        assert model is self.model

                    output_dir = os.path.join(
                        self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}",
                    )

                    self.save_model(output_dir)
                    if self.is_world_master():
                        self._rotate_checkpoints()

                    logging.info(
                        "*** Results have been saved at %s ***", self.args.output_dir
                    )
                    df.to_csv(self.args.output_dir + self.args.output_file_name + ".csv")

In [83]:
meta_trainer = MetaTrainer(model = model,
                           args = meta_args,
                           train_dataset = meta_dataset,
                           eval_dataset = eval_dataset,
                           train_data_collator = torch.utils.data._utils.collate.default_collate,
                           eval_data_collator = default_data_collator)

wandb: ERROR Error uploading "wandb-metadata.json": CommError, /tmp/tmpr5mn1jg4wandb/2mixgk7o-wandb-metadata.json is an empty file


In [76]:
meta_trainer = Trainer(model = model,
                           args = meta_args,
                           train_dataset = meta_dataset,
                           eval_dataset = eval_dataset,
                           data_collator = default_data_collator)

In [66]:
meta_trainer.train()


Batch Index:   0%|          | 0/597 [00:00<?, ?it/s][A




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=2.0, style=ProgressStyle(description_wid…

Batch Index:   0%|          | 1/597 [00:02<20:24,  2.05s/it]
  0%|          | 0/1 [00:02<?, ?it/s]







RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/transformers/modeling_albert.py", line 923, in forward
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 931, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2317, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "/home/nlp/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2115, in nll_loss
    ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target' in call to _thnn_nll_loss_forward


In [86]:
eval_loss = meta_trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=1.0, style=ProgressStyle(description_wid…




In [87]:
eval_loss['eval_loss']

0.7099567651748657

In [73]:
for i in eval_dl:
    for k, v in i.items():
        if k == 'labels' and not isinstance(v, torch.LongTensor):
            print(k, v.type())

In [10]:
#args.tasks = ['mnli']

In [11]:
# processor_dict = {
#           'mrpc': MrpcProcessor,
#           'cola': ColaProcessor,
#           'mnli': MnliProcessor,
#           'sst-2': Sst2Processor,
#           'rte': RteProcessor,
#           'wnli': WnliProcessor,
#           'qqp': QqpProcessor,
#           'qnli': QnliProcessor,
#           'sts-b': StsbProcessor
#         }
# processors = [processor_dict[task]() for task in args.tasks]

In [12]:
# GLUE_PATH = os.path.join('home', 'nlp', 'data', 'glue_data')
# dataset_dict = {
#           'mrpc': args.glue_dir+'/MRPC',
#           'cola': args.glue_dir+'/CoLA',
#           'mnli': args.glue_dir+'/MNLI',
#           'sst-2': args.glue_dir+'/SST-2',
#           'rte':  args.glue_dir+'/RTE',
#           'wnli': args.glue_dir+'/WNLI',
#           'qqp':  args.glue_dir+'/QQP',
#           'qnli': args.glue_dir+'/QNLI',
#           'sts-b': args.glue_dir+'/STS-B'
#         }
# data_dirs =  [dataset_dict[task] for task in args.tasks]

In [13]:
# for i, task in enumerate(args.tasks):
#         if task == args.target_task:
#             target_task_id = i
#             break

# task_cluster_dict = {
#       'mrpc': 0,
#       'cola': 1,
#       'mnli': 0,
#       'sst-2': 1,
#       'rte': 0,
#       'wnli': 0,
#       'qqp': 0,
#       'qnli': 2,
#       'sts-b': 3
#     }
# task_clusters = [task_cluster_dict[task] for task in args.tasks] if args.task_shared else None

In [14]:
# label_lists = [processor.get_labels() for processor in processors]

In [15]:
# task_clusters

[0, 1, 0, 1, 0, 0, 2, 3]

In [16]:
# label_lists

[['0', '1'],
 ['0', '1'],
 ['contradiction', 'entailment', 'neutral'],
 ['0', '1'],
 ['entailment', 'not_entailment'],
 ['0', '1'],
 ['entailment', 'not_entailment'],
 [None]]

In [17]:
# if not args.task_shared:
#     num_labels = [len(label_list) for label_list in label_lists]
# else:
#     cluster_num_labels = {0: 3, 1: 2, 2: 2, 3: 1}
#     num_labels = [cluster_num_labels[task_cluster] for task_cluster in task_clusters]

In [18]:
model.zero_grad()

In [19]:
num_labels

[3, 2, 3, 2, 3, 3, 2, 1]

In [20]:
 glue_output_modes[args.tasks[0]]

'classification'

In [21]:
data_dirs

['/home/nlp/data/glue_data/MRPC',
 '/home/nlp/data/glue_data/CoLA',
 '/home/nlp/data/glue_data/MNLI',
 '/home/nlp/data/glue_data/SST-2',
 '/home/nlp/data/glue_data/RTE',
 '/home/nlp/data/glue_data/QQP',
 '/home/nlp/data/glue_data/QNLI',
 '/home/nlp/data/glue_data/STS-B']

In [22]:
train_dataset_list, eval_dataset_list = [], []
for task, data_dir in tqdm(zip(args.tasks, data_dirs)):
    data_args.task_name = task
    data_args.data_dir = data_dir
    train_dataset_list.append(GlueDataset(data_args, tokenizer))
    eval_dataset_list.append(GlueDataset(data_args, tokenizer, mode = "dev"))

8it [00:24,  3.01s/it]


In [23]:
# from collections import defaultdict, OrderedDict


# class MetaDataset(torch.utils.data.Dataset):
#     def __init__(self, dataset):
#         self.dataset = dataset
#         self.args =  self.dataset.args
#         self.processor = self.dataset.processor
#         self.features = self.dataset.features
#         self.label_list = self.dataset.label_list
#         self.output_mode = self.dataset.output_mode
#         self.indices_mapping = self._get_indices_mapping()
#         self.num_labels = len(self.indices_mapping.keys())
#         self.min_len = self.get_len()
#         #self.data = self.form_data()
#         self.data = self.get_tensorized_data()
    
#     def get_len(self):
#         min_len = float("inf")
#         for values in self.indices_mapping.values():
#             min_len = min(len(values), min_len)
#         return min_len
        
#     def __len__(self):
#         return self.min_len
    
#     def _get_indices_mapping(self):
#         indices_mapping = {}
#         for idx, data in enumerate(self.dataset):
#             indices_mapping.update({idx: data.label})
        
#         temp_mapping = defaultdict(list)
#         for key, value in sorted(indices_mapping.items()):
#             temp_mapping[value].append(key)
        
#         indices_mapping = temp_mapping
#         del temp_mapping
#         return indices_mapping
        
#     def __getitem__(self, idx):
#         #res = []
#         # res = OrderedDict()
#         #for label in self.indices_mapping.keys():
#             # res[label] = self.features[self.indices_mapping[label][idx]]
#         #    res.append(self.features[self.indices_mapping[label][idx]])
        
#         return self.data[idx]
    
#     def get_tensorized_data(self):
#         tensorized_data = []
#         dtype = torch.long
#         #for i, data in tqdm(enumerate(self.data)):
#         for idx in trange(self.min_len):
#             res = []
#             for label in range(self.num_labels):
#                 data = self.features[self.indices_mapping[label][idx]]
#                 res.append({
#                     'input_ids': torch.tensor(data.input_ids, dtype=dtype),
#                     'attention_mask': torch.tensor(data.attention_mask, dtype=dtype),
#                     'token_type_ids': torch.tensor(data.token_type_ids, dtype=dtype),
#                     'labels': torch.tensor(data.label, dtype=dtype)
#                 })
#             tensorized_data.append(res)
#         return tensorized_data
    


In [40]:
len(meta_dataset[1000])

2

In [47]:
sampler = RandomSampler(meta_dataset)

In [48]:
dataloader = DataLoader(meta_dataset,
            batch_size=8,
            drop_last=True)

In [49]:
batch = next(iter(dataloader))

In [50]:
batch[0]['input_ids'].shape

torch.Size([8, 128])

In [51]:
loss = 0.
for idx, batch in enumerate(dataloader):
    for class_sample in batch:
        loss += model(**class_sample)[0]
        print(class_sample['input_ids'].shape)
    break

torch.Size([8, 128])
torch.Size([8, 128])


In [52]:
loss = model(**next(iter(dataloader))[0])[0]

In [54]:
batch = next(iter(dataloader))

In [55]:
batch[0]['input_ids'].shape

torch.Size([8, 128])

In [33]:
train_sampler_list = []
for dataset in train_dataset_list:
    train_sampler_list.append(RandomSampler(dataset))

In [35]:
train_dataloader_list, eval_dataloader_list = [], []
data_collator = default_data_collator

for train_dataset, eval_dataset, sampler in \
    tqdm(zip(train_dataset_list, eval_dataset_list, train_sampler_list)):
    
    # train_dataloader_list.append(DataLoader(train_dataset,
    #        batch_size=training_args.train_batch_size,
    #        sampler=sampler,
    #        collate_fn=data_collator,
    #        drop_last=True))
    
    eval_dataloader_list.append(DataLoader(eval_dataset,
            batch_size=training_args.train_batch_size,
            sampler=sampler,
            collate_fn=data_collator,
            drop_last=True))

1it [00:00, 106.08it/s]


In [42]:
# train_examples = [processor.get_train_examples(data_dir) for processor, data_dir in tqdm(zip(processors, data_dirs))]

In [220]:
training_args.num_epochs = 1
training_args.per_device_train_batch_size = 1
args.num_update_steps = 0

In [43]:
# train_steps_per_task = [ math.floor((len(train_example)/training_args.per_device_train_batch_size)/(args.num_update_steps+1)) for train_example in train_examples]
# total_steps = sum(train_steps_per_task) * training_args.num_train_epochs
# print(f'Total steps: {total_steps}')

In [44]:
# train_steps_per_task

In [48]:
training_args.per_device_train_batch_size

64

In [49]:
label_lists

[['contradiction', 'entailment', 'neutral']]

In [60]:
args.tasks

['mrpc', 'cola', 'mnli', 'sst-2', 'rte', 'qqp', 'qnli', 'sts-b']

In [52]:
# t_total = int(len(train_dataloader_list) // training_args.gradient_accumulation_steps * training_args.num_train_epochs)
num_train_epochs = training_args.num_train_epochs

In [53]:
# train_dataloaders_iters = [iter(train_dataloader) for train_dataloader in train_dataloader_list]

# extra_ids = []
# for t_id in range(len(args.tasks)):
#     extra_ids += [t_id] * train_steps_per_task[t_id]  #math.ceil(len(train_examples[t_id]))
# extra_ids = np.random.choice(extra_ids, len(extra_ids), replace=False) 

In [61]:
# extra_ids

In [62]:
len(train_dataset_list)

8

In [82]:
# indices_train_dataset_list = []
# for dataset in train_dataset_list:
#     cur_len = len(dataset)
#     indices = np.arange(cur_len)
#     np.random.shuffle(indices)
#     indices_train_dataset_list.append(indices)

In [83]:
# indices_train_dataset_list

[array([311245,  45402, 358631, ..., 152286, 137138, 179078])]

In [76]:
# indices = np.arange(len(train_dataset_list[0]))

In [77]:
# np.random.shuffle(indices)

In [None]:
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

In [31]:
 def empty_memory():
    import gc
    gc.collect()
    torch.cuda.empty_cache()

In [64]:
trainer = MetaTrainer(model, args, train_dataloader_list,
                     eval_dataloader_list, build_compute_metrics_fn)

NameError: name 'MetaTrainer' is not defined

In [29]:
trainer.train()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Task IDs: 0it [00:00, ?it/s][A
Task IDs: 1it [00:03,  3.75s/it][A
Task IDs: 2it [00:06,  3.31s/it][A
Task IDs: 3it [00:08,  3.00s/it][A
Task IDs: 4it [00:10,  2.79s/it][A
Task IDs: 5it [00:12,  2.64s/it][A
Task IDs: 6it [00:15,  2.55s/it][A
Task IDs: 7it [00:17,  2.47s/it][A
Task IDs: 8it [00:19,  2.42s/it][A
Task IDs: 9it [00:22,  2.39s/it][A

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=26.0, style=ProgressStyle(description_wi…




{"eval_loss": 1.1688626500276418, "eval_pearson": -0.07593542849008117, "eval_spearmanr": -0.07026298108615246, "eval_corr": -0.07309920478811682, "step": 10}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=66.0, style=ProgressStyle(description_wi…


{"eval_loss": 1.7732348694945828, "eval_pearson": 0.047892375448013634, "eval_spearmanr": 0.04789237544801361, "eval_corr": 0.04789237544801363, "step": 10}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=614.0, style=ProgressStyle(description_w…


{"eval_loss": 1.125631565185634, "eval_pearson": 0.03112561928614942, "eval_spearmanr": 0.031110041573893667, "eval_corr": 0.031117830430021545, "step": 10}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=55.0, style=ProgressStyle(description_wi…


{"eval_loss": 1.5897113214839589, "eval_pearson": -0.03872658021023936, "eval_spearmanr": -0.03872658021023939, "eval_corr": -0.03872658021023938, "step": 10}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{"eval_loss": 1.3225060535801783, "eval_pearson": -0.07009389681708172, "eval_spearmanr": -0.07272434521096278, "eval_corr": -0.07140912101402225, "step": 10}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=2527.0, style=ProgressStyle(description_…


{"eval_loss": 0.9290406081956698, "eval_pearson": 0.17356125955788465, "eval_spearmanr": 0.1917711813336911, "eval_corr": 0.18266622044578787, "step": 10}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=342.0, style=ProgressStyle(description_w…


Task IDs: 10it [02:16, 36.08s/it][A


{"eval_loss": 1.299326989734382, "eval_pearson": -0.1945533855892897, "eval_spearmanr": -0.1945780773062058, "eval_corr": -0.19456573144774775, "step": 10}



Task IDs: 11it [02:19, 25.97s/it][A
Task IDs: 12it [02:21, 18.89s/it][A
Task IDs: 13it [02:23, 13.94s/it][A
Task IDs: 14it [02:26, 10.47s/it][A
Task IDs: 15it [02:28,  8.05s/it][A
Task IDs: 16it [02:31,  6.35s/it][A
Task IDs: 17it [02:33,  5.16s/it][A
Task IDs: 18it [02:35,  4.32s/it][A
Task IDs: 19it [02:38,  3.74s/it][A

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=26.0, style=ProgressStyle(description_wi…


{"eval_loss": 0.754741173524123, "eval_pearson": 0.05367693570305286, "eval_spearmanr": 0.05343511206110535, "eval_corr": 0.053556023882079105, "step": 20}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=66.0, style=ProgressStyle(description_wi…


{"eval_loss": 1.2134370496778777, "eval_pearson": -0.0027618219290598003, "eval_spearmanr": 0.0009026285023506604, "eval_corr": -0.00092959671335457, "step": 20}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=614.0, style=ProgressStyle(description_w…


{"eval_loss": 1.2278344829230043, "eval_pearson": -0.01632747615200033, "eval_spearmanr": -0.0161722522877514, "eval_corr": -0.016249864219875863, "step": 20}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=55.0, style=ProgressStyle(description_wi…


{"eval_loss": 1.1502182028510355, "eval_pearson": 0.005972333635109359, "eval_spearmanr": 0.009997267686546754, "eval_corr": 0.007984800660828056, "step": 20}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=18.0, style=ProgressStyle(description_wi…


{"eval_loss": 0.8318254517184364, "eval_pearson": 0.0029989208045044153, "eval_spearmanr": 0.0029989208045043867, "eval_corr": 0.002998920804504401, "step": 20}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=2527.0, style=ProgressStyle(description_…

Task IDs: 19it [03:38, 11.49s/it]
Epoch:   0%|          | 0/3 [03:38<?, ?it/s]







KeyboardInterrupt: 

In [35]:
data_args

GlueDataTrainingArguments(task_name='sts-b', data_dir='/home/nlp/data/glue_data/STS-B', max_seq_length=128, overwrite_cache=False)

In [36]:
label_lists

[['0', '1'],
 ['0', '1'],
 ['contradiction', 'entailment', 'neutral'],
 ['0', '1'],
 ['entailment', 'not_entailment'],
 ['0', '1'],
 ['entailment', 'not_entailment'],
 [None]]

In [None]:
# del trainer
# import gc
# gc.collect()
# torch.cuda.empty_cache()