In [1]:
!ls ../../models/

bertweet-hate-speech	     robertuito-irony
robertuito-emotion-analysis  robertuito-lince-ner
robertuito-hate-speech	     robertuito-sentiment-analysis


In [77]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained("../../models/robertuito-lince-ner")

tokenizer = AutoTokenizer.from_pretrained("../../models/robertuito-lince-ner")
tokenizer.model_max_length = 128

loading configuration file ../../models/robertuito-lince-ner/config.json
Model config RobertaConfig {
  "_name_or_path": "../../models/robertuito-lince-ner",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-EVENT",
    "2": "I-EVENT",
    "3": "B-GROUP",
    "4": "I-GROUP",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-ORG",
    "8": "I-ORG",
    "9": "B-OTHER",
    "10": "I-OTHER",
    "11": "B-PER",
    "12": "I-PER",
    "13": "B-PROD",
    "14": "I-PROD",
    "15": "B-TIME",
    "16": "I-TIME",
    "17": "B-TITLE",
    "18": "I-TITLE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-EVENT": 1,
    "B-GROUP": 3,
    "B-LOC": 5,
    "B-ORG": 7,
    "B-OTHER": 9,
    "B-PER"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


All model checkpoint weights were used when initializing RobertaForTokenClassification.

All the weights of RobertaForTokenClassification were initialized from the model checkpoint at ../../models/robertuito-lince-ner.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForTokenClassification for predictions without further training.
Didn't find file ../../models/robertuito-lince-ner/added_tokens.json. We won't load it.
loading file None
loading file ../../models/robertuito-lince-ner/special_tokens_map.json
loading file ../../models/robertuito-lince-ner/tokenizer_config.json
loading file ../../models/robertuito-lince-ner/tokenizer.json


In [78]:
def load_conll(path, lang="es"):
    """
    Loads CoNLL-2003 dataset
    """
    with open(path) as f:
        lines = f.read().splitlines()
    data = []
    current_line = []
    for line in lines:
        line = line.strip()
        if line == "":
            data.append(current_line)
            current_line = []
        else:
            current_line.append(line.split("\t"))
    return data

test_path = "../../data/lince/ner_spaeng/test.conll"

test_data = load_conll(test_path)

len(test_data)

23527

In [104]:
from datasets import Dataset
from pysentimiento.lince.ner import preprocess_token, tokenize_and_align_labels

words = [[x[0] for x in sentence] for sentence in test_data]
ner = [[None] * len(x) for x in test_data]

for w, l in zip(words, ner):
    assert len(w) == len(l)

test_dataset = Dataset.from_dict(
    {"words": words, "labels": ner}
)

test_dataset = test_dataset.map(
    lambda x: {
        "words": [preprocess_token(word, "es") for word in x["words"]]
    }
)

tokenize_fun = lambda x: tokenize_and_align_labels(x, tokenizer)
test_dataset = test_dataset.map(
    tokenize_fun, batched=True, batch_size=32, 
)

test_dataset = test_dataset.remove_columns(["labels"])

  0%|          | 0/23527 [00:00<?, ?ex/s]

  0%|          | 0/736 [00:00<?, ?ba/s]

In [105]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
data_collator = DataCollatorForTokenClassification(tokenizer)

train_args = TrainingArguments(
    output_dir="./test/",
    per_device_eval_batch_size=32,
    do_train=False,
)

trainer_args = {
    "model": model,
    "args": train_args,
    "eval_dataset": test_dataset,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
}

eval_trainer = Trainer(**trainer_args)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [106]:
ret = eval_trainer.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: words, word_ids.
***** Running Prediction *****
  Num examples = 23527
  Batch size = 32


In [123]:
from pysentimiento.lince.ner import id2label, label2id

idx = 103

word_ids = test_dataset[idx]["word_ids"]
input_ids = test_dataset[idx]["input_ids"]
preds = ret.predictions[idx]

assert len(word_ids) == len(input_ids)

current_word_id = None
for word_id, token_id, pred in zip(word_ids, input_ids, preds):
    token = tokenizer.decode(token_id)
    print(token)
    if current_word_id != word_id:
        current_word_id = word_id
        label = id2label[pred.argmax()]
        print(label)

<s>
"
O
@usuario
O
:
O
dia
O
internacional
O
de
O
coger
O
le
las
O
nalgas
O
a
O
angie
B-PER
.
O
"
O
guil
O
ty
..
O
</s>
O
