In [1]:
# Show which model directories exist under ../../models/ before loading one of them.
!ls ../../models/

bertweet-hate-speech	     robertuito-irony
robertuito-emotion-analysis  robertuito-lince-ner
robertuito-hate-speech	     robertuito-sentiment-analysis


In [44]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModelForTokenClassification, AutoTokenizer
from pysentimiento.lince.ner import load_datasets

# Model and tokenizer are both loaded from the same local checkpoint directory.
model_dir = "../../models/robertuito-lince-ner"
model = AutoModelForTokenClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Cap the tokenizer's maximum sequence length at 128.
tokenizer.model_max_length = 128

# load_datasets returns three datasets; only the third one is kept (as test_dataset).
# preprocess=False: preprocessing is applied explicitly in a later cell
# (presumably so the raw words stay available -- confirm against load_datasets).
_, _, test_dataset = load_datasets(
    lang="es",
    preprocess=False,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Reusing dataset lince (/home/jmperez/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jmperez/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589/cache-f749fd5ca349b1d8.arrow
Loading cached processed dataset at /home/jmperez/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589/cache-875716555d36ff1f.arrow
Loading cached processed dataset at /home/jmperez/.cache/huggingface/datasets/lince/ner_spaeng/1.0.0/10d41747f55f0849fa84ac579ea1acfa7df49aa2015b60426bc459c111b3d589/cache-1afcf30fe2e4ebdb.arrow


In [45]:
from datasets import Dataset
from pysentimiento.lince.ner import preprocess_token, tokenize_and_align_labels

# Snapshot the untouched words first; later cells report predictions against them.
# NOTE: test_dataset is rebound in place below, so re-running this cell on its own
# would snapshot words that have already been preprocessed.
original_words = test_dataset["words"]

# Apply preprocess_token (with the "es" argument) to every word, producing exactly
# one output token per input word so positions stay aligned with original_words.
test_dataset = test_dataset.map(
    lambda example: {
        "words": [preprocess_token(token, "es") for token in example["words"]],
    },
)


  0%|          | 0/23527 [00:00<?, ?ex/s]

In [46]:
# Pair each raw word of the first test sentence with its preprocessed form
# to eyeball what preprocess_token changed.
list(zip(original_words[0], test_dataset["words"][0]))

[('"', '"'),
 ('@PattyB_14', '@usuario'),
 (':', ':'),
 ('Este', 'Este'),
 ('weekend', 'weekend'),
 ('es', 'es'),
 ('largo', 'largo'),
 ('!', '!'),
 ('A', 'A'),
 ('celebrar', 'celebrar'),
 ('mi', 'mi'),
 ('bday', 'bday'),
 ('allllllll', 'alll'),
 ('weeeekend', 'weeekend'),
 ('looooooong', 'looong'),
 ('"', '"'),
 ('#PARY', '#PARY'),
 ('wuutt', 'wuutt'),
 ('wutt', 'wutt'),
 ('!', '!')]

In [47]:
from tqdm.auto import tqdm
from transformers import pipeline

# Build a transformers "ner" pipeline around the model and tokenizer loaded earlier.
# NOTE(review): `pipe` and `ner` are not referenced by any later cell visible here, and
# `idx` is overwritten by the loops further down -- these look like leftovers from an
# earlier experiment; confirm nothing else uses them before deleting.
pipe = pipeline("ner", model=model, tokenizer=tokenizer)
idx = 103
ner = []


In [48]:

def tokenize_fun(batch):
    """Run tokenize_and_align_labels on a batch using the notebook's tokenizer."""
    return tokenize_and_align_labels(batch, tokenizer)


# Tokenize in batches of 32, then drop the "labels" column before prediction.
test_dataset = test_dataset.map(
    tokenize_fun,
    batched=True,
    batch_size=32,
).remove_columns(["labels"])

  0%|          | 0/736 [00:00<?, ?ba/s]

In [49]:
from tqdm.auto import tqdm

# Sanity check: for every sentence, the highest word index produced by the tokenizer
# (+ 1) must equal the number of original words. A mismatch means the tokenized
# sequence does not reach the last word of the sentence.
problematic_instances = []

for idx, row in tqdm(enumerate(test_dataset), total=len(test_dataset)):
    words = original_words[idx]
    # Entries of word_ids that are None do not belong to any word, so look for the
    # largest real index instead of assuming the layout `[..., last_word, <None>]`.
    # This avoids a TypeError/IndexError on sequences without word tokens
    # (default=-1 makes such a sequence count as covering zero words).
    last_word_id = max(
        (word_id for word_id in row["word_ids"] if word_id is not None),
        default=-1,
    )
    if last_word_id + 1 != len(words):
        problematic_instances.append(idx)

print(f"{len(problematic_instances)} problematic instances")

  0%|          | 0/23527 [00:00<?, ?it/s]

0 problematic instances


In [50]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
# Prediction-only configuration: training disabled, batches of 32 per device.
train_args = TrainingArguments(
    output_dir="./test/",
    do_train=False,
    per_device_eval_batch_size=32,
)
# Token-classification collator built from the tokenizer; handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Keep the Trainer keyword arguments in one mapping, then build the Trainer from it.
trainer_args = dict(
    model=model,
    args=train_args,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
eval_trainer = Trainer(**trainer_args)

In [51]:
# Run prediction over the whole tokenized test dataset; `ret.predictions` is indexed
# per sentence further down to choose a label for each word.
ret = eval_trainer.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner, word_ids, idx, words, lid.
***** Running Prediction *****
  Num examples = 23527
  Batch size = 32


In [54]:
from tqdm.auto import tqdm
from pysentimiento.lince.ner import id2label, label2id

# Convert token-level predictions into one [original word, label] pair per word.
# A word's label is the argmax of the prediction at its first token; the remaining
# tokens of the same word are skipped.
outputs = []

for idx, row in tqdm(enumerate(test_dataset), total=len(test_dataset)):
    word_ids = row["word_ids"]
    input_ids = row["input_ids"]
    words = original_words[idx]
    # Guard against misalignment: the last word index must match the word count,
    # otherwise words[...] below would pair labels with the wrong words.
    assert (word_ids[-2] + 1) == len(words), (
        f"sentence {idx}: word_ids end at {word_ids[-2]} but there are {len(words)} words"
    )
    preds = ret.predictions[idx]

    sentence_output = []
    current_word_id = None

    # input_ids stays in the zip only so the iteration length is exactly as before;
    # the per-token tokenizer.decode() call was dropped because its result was unused.
    for word_id, _token_id, pred in zip(word_ids, input_ids, preds):
        # word_id is None for tokens that belong to no word; a change of word_id
        # marks the first token of a new word.
        if current_word_id != word_id and word_id is not None:
            current_word_id = word_id
            label = id2label[pred.argmax()]
            sentence_output.append([words[current_word_id], label])
    outputs.append(sentence_output)


  0%|          | 0/23527 [00:00<?, ?it/s]

In [57]:
# Spot-check: [original word, predicted label] pairs for the first test sentence.
outputs[0]

[['"', 'O'],
 ['@PattyB_14', 'O'],
 [':', 'O'],
 ['Este', 'O'],
 ['weekend', 'O'],
 ['es', 'O'],
 ['largo', 'O'],
 ['!', 'O'],
 ['A', 'O'],
 ['celebrar', 'O'],
 ['mi', 'O'],
 ['bday', 'O'],
 ['allllllll', 'O'],
 ['weeeekend', 'O'],
 ['looooooong', 'O'],
 ['"', 'O'],
 ['#PARY', 'O'],
 ['wuutt', 'O'],
 ['wutt', 'O'],
 ['!', 'O']]

In [60]:
def write_conll(path, data):
    """Write sentences to ``path`` as tab-separated lines, CoNLL style.

    Args:
        path: Destination file; it is created or overwritten.
        data: Iterable of sentences. Each sentence is an iterable of rows and each
            row an iterable of fields (e.g. ``[word, label]`` or just ``[label]``).
            Fields that are not strings are converted with ``str``.

    Each row becomes one line with its fields joined by tabs, and every sentence
    (including an empty one) is followed by a blank line.
    """
    # Explicit UTF-8 so non-ASCII tokens are written the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        for sentence in data:
            for row in sentence:
                f.write("\t".join(map(str, row)) + "\n")
            f.write("\n")

# Export the predicted labels only (the words are dropped): one label per line,
# with a blank line after each sentence.
write_conll(
    "ner_spa_eng.txt",
    [[[pair[1]] for pair in sentence] for sentence in outputs],
)