In [2]:
import logging
import os
import argparse

import numpy as np
from torch.utils.data.dataset import Dataset
from transformers.data.processors.utils import InputExample, InputFeatures
from transformers.data.processors.glue import glue_convert_examples_to_features
from transformers.data.processors.utils import DataProcessor
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from sfda.models import sfdaSourceRobertaNegation
from sfda.trainer import sfdaTrainer

In [3]:
# Notebook-wide logger.
logger = logging.getLogger(__name__)

# Maximum sequence length handed to the feature converter.
max_length = 128

# Label strings recognised in the first TSV column, in class-index order.
labels = ["-1", "1"]


Dataset Preprocessing

In [4]:
class NegationDataset(Dataset):
    """Torch dataset wrapping a sequence of pre-computed ``InputFeatures``.

    The label vocabulary is defined once on the class (``LABELS``) so that
    ``from_tsv`` and ``get_labels`` cannot drift apart.
    """

    # Single source of truth for the label strings, in class-index order.
    LABELS = ("-1", "1")

    def __init__(self, features):
        """Store the features.

        Args:
            features: indexable, sized collection of feature objects (as
                returned by ``glue_convert_examples_to_features``).
        """
        self.features = features
        # A fresh list per instance, so mutating one dataset's label_list
        # never affects another dataset or the class constant.
        self.label_list = list(self.LABELS)

    def __len__(self):
        """Return the number of stored features."""
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        """Return the feature object at position ``i``."""
        return self.features[i]

    def get_labels(self):
        """Return this dataset's list of label strings, in class-index order."""
        return self.label_list

    @classmethod
    def from_tsv(cls, tsv_file, tokenizer):
        """Build an unlabeled dataset from a tab-separated file, one instance per row.

        When a row's first column is one of ``LABELS`` it is treated as a label
        column and dropped; the remaining columns are re-joined with tabs to
        form the text. Rows without a leading label are used whole. Examples
        are always created with ``label=None``, so the result is suitable for
        prediction only.

        Args:
            tsv_file: path of the TSV file to read.
            tokenizer: tokenizer passed to ``glue_convert_examples_to_features``.

        Returns:
            A ``NegationDataset`` holding one feature object per input row.
        """
        lines = DataProcessor._read_tsv(tsv_file)
        examples = []
        for i, line in enumerate(lines):
            # Check `line` for emptiness first: a blank row may come back as an
            # empty list, and indexing line[0] would raise IndexError. Such a
            # row becomes an empty-text example so that output rows stay
            # aligned with input rows.
            has_label = bool(line) and line[0] in cls.LABELS
            text_a = '\t'.join(line[1:] if has_label else line)
            examples.append(
                InputExample(guid='instance-%d' % i, text_a=text_a, text_b=None, label=None)
            )

        # `max_length` is the notebook-level constant defined in the config cell.
        features = glue_convert_examples_to_features(
            examples,
            tokenizer,
            max_length=max_length,
            label_list=list(cls.LABELS),
            output_mode='classification',
        )
        return cls(features)

In [5]:
# Input TSV to run prediction on.
data_file = "practice_text/train.tsv"
# Directory that receives the prediction and feature files.
output_dir = "../outputs/negation/"

In [6]:
# Identifier of the pretrained checkpoint used as the base model.
model_name = "tmills/roberta_sfda_sharpseed"

# Load the checkpoint's configuration first, then the tokenizer using that configuration.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
 

In [7]:
# Instantiate the project's classifier from the same checkpoint and config.
model = sfdaSourceRobertaNegation.from_pretrained(
    model_name,
    config=config,
)

Some weights of the model checkpoint at tmills/roberta_sfda_sharpseed were not used when initializing sfdaSourceRobertaNegation: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing sfdaSourceRobertaNegation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing sfdaSourceRobertaNegation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Read the TSV and convert every row into model-ready features.
test_dataset = NegationDataset.from_tsv(tsv_file=data_file, tokenizer=tokenizer)

# Trainer is used for prediction only here, so no metric function is supplied.
trainer = sfdaTrainer(
    model=model,
    args=TrainingArguments(output_dir='save_run/'),
    compute_metrics=None,
)

You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it.


In [9]:
# Run inference over the dataset, also requesting features via ret_feats.
prediction_dict = trainer.predict(
    test_dataset=test_dataset,
    ret_feats=True,
)

HBox(children=(HTML(value='Prediction'), FloatProgress(value=0.0, max=361.0), HTML(value='')))




In [10]:
# Raw model outputs for every instance.
# NOTE(review): presumably shaped (n_instances, n_labels) — confirm against sfdaTrainer.predict.
scores = prediction_dict.predictions
# Index of the highest-scoring column in each row; used later to look up the label string.
predictions = np.argmax(scores, axis=1)

In [11]:
# Feature matrix attached to the prediction output.
# NOTE(review): presumably present only because predict() was called with ret_feats=True — confirm in sfdaTrainer.
feat_matrix = prediction_dict.feat_matrix

Saving Outputs

In [12]:
os.makedirs(output_dir, exist_ok=True)
output_test_file = os.path.join(output_dir, 'train_pred.tsv')
feat_matrix_file = os.path.join(output_dir, 'train_scores_and_feat_mat.npy')

# Look the label list up once instead of on every loop iteration.
label_names = test_dataset.get_labels()

# One predicted label string per line, in the same order as the input rows.
with open(output_test_file, "w") as writer:
    logger.info("***** Test results *****")
    for label_index in predictions:
        writer.write("%s\n" % label_names[label_index])

# Two arrays are written back-to-back into a single file: feat_matrix first,
# then scores (note: the reverse of the order suggested by the file name).
# Reading them back takes two consecutive np.load calls on the same open
# file handle, in that order.
with open(feat_matrix_file, 'wb') as file:
    np.save(file, feat_matrix)
    np.save(file, scores)