In [1]:
import dataclasses
import logging
import os
from os.path import basename, dirname
import sys
from dataclasses import dataclass, field
from typing import Callable, Dict, Optional, List, Union
from filelock import FileLock
import time
import random
from enum import Enum

import numpy as np

import torch
from torch.utils.data.dataset import Dataset
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction
from transformers.data.processors.utils import InputFeatures
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.data.metrics import acc_and_f1
from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures
from transformers.data.processors.glue import glue_convert_examples_to_features

In [2]:
# from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)


# Logger named after the running module; the cells below send their progress and result messages through it.
logger = logging.getLogger(__name__)


In [3]:
class Split(Enum):
    """The three dataset partitions; each member's value equals its name."""

    train = "train"
    dev = "dev"
    test = "test"


In [4]:
@dataclass
class DataTrainingArguments:
    """
    Data-related options for training and evaluation.

    Every option is a dataclass field carrying a ``help`` entry in its
    metadata, so `HfArgumentParser` can expose the class as command-line
    arguments.
    """

    # Negation is the only task handled here, so GLUE's `task_name` field is intentionally left out.
    data_dir: str = field(
        default="practice_text/data/",
        metadata=dict(
            help="The input data dir. Should contain the .tsv files (or other data files) for the task."
        ),
    )
    max_seq_length: int = field(
        default=128,
        metadata=dict(
            help=(
                "The maximum total input sequence length after tokenization. "
                "Sequences longer than this will be truncated, sequences shorter will be padded."
            )
        ),
    )
    overwrite_cache: bool = field(
        default=False,
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
    )


In [5]:
class NegationProcessor(DataProcessor):
    """Processor for the shared-task negation datasets stored as TSV files."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        guid = tensor_dict["idx"].numpy()
        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence, None, label)

    def _examples_for_split(self, data_dir, split):
        """Read ``<data_dir>/<split>.tsv`` and turn its rows into examples."""
        rows = self._read_tsv(os.path.join(data_dir, split + ".tsv"))
        return self._create_examples(rows, split)

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._examples_for_split(data_dir, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._examples_for_split(data_dir, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._examples_for_split(data_dir, "test")

    def get_labels(self):
        """See base class."""
        return ["-1", "1"]

    def _create_examples(self, lines, set_type):
        """Build one `InputExample` per row of `lines`.

        For the "test" set the first column is the text and no label is set.
        For every other set the first column is an integer label and the
        remaining columns, re-joined with tabs, are the text.
        """
        unlabeled = set_type == "test"
        examples = []
        for row_number, columns in enumerate(lines):
            if unlabeled:
                text, label = columns[0], None
            else:
                # The sign is flipped so that 1 means "negated"; the F1 score
                # then automatically refers to the negated class.
                label = str(-int(columns[0]))
                text = '\t'.join(columns[1:])
            examples.append(
                InputExample(
                    guid=f"{set_type}-{row_number}",
                    text_a=text,
                    text_b=None,
                    label=label,
                )
            )
        return examples


In [6]:
# Label strings of the negation task.
labels = ["-1", "1"]
# Module-level sequence-length setting (presumably counted in tokens — TODO confirm).
max_length = 128
# InputExample / InputFeatures are the example and feature containers referenced by NegationDataset below.
from transformers.data.processors.utils import InputExample, InputFeatures
class NegationDataset(Dataset):
    """In-memory dataset holding one feature object per negation instance.

    Instances are normally built with `from_tsv`, which reads a TSV file,
    wraps each row in an `InputExample` and tokenizes everything with
    `glue_convert_examples_to_features`.
    """

    # Defaults are kept on the class so that `from_tsv` no longer depends on
    # notebook-level globals that another cell could rebind or never define.
    DEFAULT_LABELS = ("-1", "1")
    DEFAULT_MAX_LENGTH = 128

    def __init__(self, features, label_list=None):
        """Store `features`; `label_list` defaults to `DEFAULT_LABELS`."""
        self.features = features
        self.label_list = list(self.DEFAULT_LABELS if label_list is None else label_list)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> "InputFeatures":
        return self.features[i]

    def get_labels(self):
        """Return the label strings, ordered by class index."""
        return self.label_list

    @staticmethod
    def _split_label(line, label_list):
        """Split one TSV row into ``(label, text)``.

        When the first column is one of `label_list` it is taken as the label
        and the remaining columns form the text. Otherwise the row is treated
        as unlabeled: the label is None and all columns form the text.
        Columns are re-joined with tabs in both cases.
        """
        if line[0] in label_list:
            return line[0], '\t'.join(line[1:])
        return None, '\t'.join(line)

    @classmethod
    def from_tsv(cls, tsv_file, tokenizer, max_length=None, label_list=None):
        """Build a dataset from a labeled or unlabeled TSV file.

        Args:
            tsv_file: Path of the TSV file to read.
            tokenizer: Tokenizer handed to `glue_convert_examples_to_features`.
            max_length: Value passed as `max_length` to the feature converter;
                defaults to `DEFAULT_MAX_LENGTH`.
            label_list: Accepted label strings; defaults to `DEFAULT_LABELS`.

        Returns:
            A new dataset wrapping the converted features.
        """
        if max_length is None:
            max_length = cls.DEFAULT_MAX_LENGTH
        label_list = list(cls.DEFAULT_LABELS if label_list is None else label_list)

        examples = []
        for i, line in enumerate(DataProcessor._read_tsv(tsv_file)):
            label, text_a = cls._split_label(line, label_list)
            examples.append(
                InputExample(guid='instance-%d' % i, text_a=text_a, text_b=None, label=label)
            )

        features = glue_convert_examples_to_features(
            examples,
            tokenizer,
            max_length=max_length,
            label_list=label_list,
            output_mode='classification',
        )
        return cls(features, label_list=label_list)

In [7]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default="tmills/roberta_sfda_sharpseed",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

In [8]:
# Fractions of the data given to the train and dev splits; test receives whatever remains.
train_rat, dev_rat = 0.8, 0.1
test_rat = 1.0 - train_rat - dev_rat

# Each fraction must be a valid proportion.
assert 0 <= train_rat <= 1.0
assert 0 <= dev_rat <= 1.0
assert 0 <= test_rat <= 1.0

In [91]:
# Pair every label with its text line, shuffle the pairs and write them out as
# train/dev/test TSV files using the train_rat / dev_rat fractions.

# Fixed seed so that re-running this cell reproduces the same partition.
SPLIT_SEED = 42


def _write_pairs(path, pairs):
    """Write each (first, second) pair to `path` as one tab-separated line."""
    with open(path, 'w', encoding="utf-8-sig") as handle:
        for first, second in pairs:
            handle.write(F'{first}\t{second}\n')


# Read both inputs completely and close them before any output file is touched.
with open('exp/Data/train_on_dev/train_1.tsv', 'r') as fil_text:
    arr_text = [lin.strip() for lin in fil_text]
with open('exp/Data/train_on_dev/dev_labels.txt', 'r') as fil_labs:
    arr_lab = [lin.strip() for lin in fil_labs]

# Validate up front: a missing text line used to surface as an IndexError in
# the middle of writing, leaving truncated output files behind.
if len(arr_text) < len(arr_lab):
    raise ValueError(
        "Found %d labels but only %d text lines; cannot pair them." % (len(arr_lab), len(arr_text))
    )
if len(arr_text) > len(arr_lab):
    # Surplus text lines are ignored (unchanged behavior), but no longer silently.
    logging.getLogger(__name__).warning(
        "Ignoring %d text lines that have no label.", len(arr_text) - len(arr_lab)
    )

all_idx = list(range(len(arr_lab)))
# A private generator keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(all_idx)

train_end = int(len(all_idx) * train_rat)
dev_end = int(len(all_idx) * (dev_rat + train_rat))
train_idx = all_idx[:train_end]
dev_idx = all_idx[train_end:dev_end]
test_idx = all_idx[dev_end:]

_write_pairs('exp/Data/train_on_dev/train.tsv', ((arr_lab[i], arr_text[i]) for i in train_idx))
_write_pairs('exp/Data/train_on_dev/dev.tsv', ((arr_lab[i], arr_text[i]) for i in dev_idx))
# NOTE(review): test rows are written text-first, unlike train/dev which are
# label-first — confirm that whatever reads test.tsv expects this column order.
_write_pairs('exp/Data/train_on_dev/test.tsv', ((arr_text[i], arr_lab[i]) for i in test_idx))
        

In [10]:
# Prefix every dev sentence with its label and write the result to
# practice_text/data/dev.tsv, one "label<TAB>text" row per line.

# Read both inputs completely and close them before the output is created.
with open('practice_text/dev.tsv','r') as fil_text:
    arr_text = [lin.strip() for lin in fil_text]
with open('practice_text/dev_labels.txt','r') as fil_labs:
    arr_lab = [lin.strip() for lin in fil_labs]

# Validate up front: a missing text line used to surface as an IndexError in
# the middle of writing, leaving a truncated dev.tsv behind.
if len(arr_text) < len(arr_lab):
    raise ValueError(
        "Found %d labels but only %d text lines; cannot pair them." % (len(arr_lab), len(arr_text))
    )

# Make sure the target directory exists before opening the output file.
os.makedirs('practice_text/data', exist_ok=True)
with open('practice_text/data/dev.tsv','w',encoding="utf-8-sig") as fil_dev:
    # zip stops after the last label, so surplus text lines are ignored as before.
    for lab, text in zip(arr_lab, arr_text):
        fil_dev.write(F'{lab}\t{text}\n')

In [26]:
# Trainer configuration: train, evaluate after every epoch and predict, writing to ./models.
# NOTE(review): the recorded run of the training cell ended with "CUDA out of memory" on a
# ~4 GiB GPU while using the default per-device batch size of 8 — consider a smaller batch
# size combined with gradient accumulation.
training_args = TrainingArguments(
    output_dir='models',
    do_train=True,
    do_eval=True,
    do_predict=True,
    learning_rate=5e-6,
    evaluation_strategy='epoch',
    num_train_epochs=10,
)

In [10]:
# Both argument sets are created with their declared defaults.
model_args = ModelArguments()
data_args = DataTrainingArguments()

In [27]:

import shutil


# Remove a non-empty output directory left over from an earlier run before training.
# NOTE(review): the directory is deleted precisely when `overwrite_output_dir` is NOT
# set — confirm that wiping earlier results without that opt-in is intended.
stale_output_dir = training_args.output_dir
if os.path.exists(stale_output_dir) and os.listdir(stale_output_dir):
    if training_args.do_train and not training_args.overwrite_output_dir:
        shutil.rmtree(stale_output_dir)


In [28]:
# Setup logging
# Configure logging: INFO level when local_rank is -1 or 0, warnings only otherwise.
logging.basicConfig(
    level=(logging.INFO if training_args.local_rank in (-1, 0) else logging.WARNING),
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

# One-line summary of the execution environment, emitted at warning level.
is_distributed = training_args.local_rank != -1
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    is_distributed,
    training_args.fp16,
)



In [29]:
logger.info("Training/evaluation parameters %s", training_args)

# Seed the random number generators so that runs are repeatable.
set_seed(training_args.seed)

# Negation is a fixed two-class classification task, so these are plain
# constants. They were previously wrapped in a try/except KeyError that could
# never trigger, and whose handler read `data_args.task_name`, an attribute
# DataTrainingArguments does not define.
num_labels = 2
output_mode = 'classification'


10/17/2020 18:47:10 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='models', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=None, evaluation_strategy=<EvaluationStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-06, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, warmup_steps=0, logging_dir='runs/Oct17_18-47-06_prigubot', logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=None, disable_tqdm=False, remo

In [30]:
# Load the model configuration and tokenizer. An explicit config/tokenizer name
# takes precedence; otherwise the model name or path is used for both.
# `cache_dir` is forwarded so that ModelArguments.cache_dir is honored instead of
# being silently ignored (None keeps the library's default cache location).
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

In [31]:
# Display which concrete tokenizer class AutoTokenizer instantiated.
tokenizer.__class__

transformers.tokenization_roberta.RobertaTokenizer

In [32]:
def build_compute_metrics_fn() -> Callable[[EvalPrediction], Dict]:
    """Create the metrics callback that is handed to the `Trainer`.

    Returns:
        A function that takes an `EvalPrediction`, selects the highest-scoring
        class along axis 1 of its predictions and passes those class indices,
        together with the reference label ids, to `acc_and_f1`.
    """

    def _score(p: EvalPrediction):
        predicted_ids = np.argmax(p.predictions, axis=1)
        return acc_and_f1(predicted_ids, p.label_ids)

    return _score

In [33]:
# Tokenize the dev and test TSV files located in data_args.data_dir.
eval_dataset = NegationDataset.from_tsv(
    tsv_file=os.path.join(data_args.data_dir, "dev.tsv"),
    tokenizer=tokenizer,
)
test_dataset = NegationDataset.from_tsv(
    tsv_file=os.path.join(data_args.data_dir, "test.tsv"),
    tokenizer=tokenizer,
)

In [35]:
# `conf` selects which train_top{conf}.tsv file is used for training.
conf = 10

train_dataset = NegationDataset.from_tsv(
    tsv_file=F"practice_text/train_entropy_divisions/train_top{conf}.tsv",
    tokenizer=tokenizer,
)

# `cache_dir` is forwarded so that ModelArguments.cache_dir is honored instead of
# being silently ignored (None keeps the library's default cache location).
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    # A ".ckpt" in the name marks the weights as a TensorFlow checkpoint.
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=build_compute_metrics_fn(),
)

# Training
if training_args.do_train:
    # A local directory is handed to the trainer as `model_path`; a hub identifier is not.
    local_model_dir = (
        model_args.model_name_or_path
        if os.path.isdir(model_args.model_name_or_path)
        else None
    )
    trainer.train(model_path=local_model_dir)
    trainer.save_model()
    # The tokenizer is saved next to the model so that the output directory is
    # self-contained and can be shared on huggingface.co/models.
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)



# Evaluation: compute the metrics on the dev set, log them, store them in
# <output_dir>/eval_results.txt and print a one-line summary.
eval_results = {}
if training_args.do_eval:
    logger.info("*** Evaluate ***")

    trainer.compute_metrics = build_compute_metrics_fn()
    eval_result = trainer.evaluate(eval_dataset=eval_dataset)
    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")

    # Only the world-master process writes the results file.
    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for metric_name in eval_result:
                metric_value = eval_result[metric_name]
                logger.info("  %s = %s", metric_name, metric_value)
                writer.write("%s = %s\n" % (metric_name, metric_value))

    eval_results.update(eval_result)
    print(F'Top {conf} - {eval_results}')

# Prediction: label the test set and write one predicted label per line.
if training_args.do_predict:
    # Goes through the module logger like every other message in this
    # notebook (it previously went to the root logger via `logging.info`).
    logger.info("*** Test ***")
    predictions = trainer.predict(test_dataset=test_dataset).predictions
    if output_mode == "classification":
        # Reduce the per-class scores to the index of the best class.
        predictions = np.argmax(predictions, axis=1)

    output_test_file = F"practice_text/train_entropy_divisions/dev_top{conf}.tsv"

    # Only the world-master process writes the predictions file.
    if trainer.is_world_master():
        # Looked up once instead of once per prediction.
        label_names = test_dataset.get_labels()
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for class_id in predictions:
                writer.write("%s\n" % label_names[class_id])

Some weights of the model checkpoint at tmills/roberta_sfda_sharpseed were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it.


HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=36.0), HTML(value='')))

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.95 GiB total capacity; 2.98 GiB already allocated; 23.44 MiB free; 3.15 GiB reserved in total by PyTorch)