In [7]:
!ls ../../models/

bertweet-hate-speech	     robertuito-irony
robertuito-emotion-analysis  robertuito-lince-ner
robertuito-hate-speech	     robertuito-sentiment-analysis


In [1]:
%load_ext autoreload
%autoreload 2
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained("../../models/robertuito-lince-ner")

tokenizer = AutoTokenizer.from_pretrained("../../models/robertuito-lince-ner")
tokenizer.model_max_length = 128

In [2]:
def load_conll(path, lang="es"):
    """
    Loads CoNLL-2003 dataset
    """
    with open(path) as f:
        lines = f.read().splitlines()
    data = []
    current_line = []
    for line in lines:
        line = line.strip()
        if line == "":
            data.append(current_line)
            current_line = []
        else:
            current_line.append(line.split("\t"))
    return data

test_path = "../../data/lince/ner_spaeng/test.conll"

test_data = load_conll(test_path)

len(test_data)

23527

In [3]:
!head -n 40 $test_path

"	other
@PattyB_14	other
:	other
Este	lang2
weekend	lang1
es	lang2
largo	lang2
!	other
A	lang2
celebrar	lang2
mi	lang2
bday	lang1
allllllll	lang1
weeeekend	lang1
looooooong	lang1
"	other
#PARY	lang1
wuutt	lang1
wutt	lang1
!	other

odio	lang2
el	lang2
proceso	lang2
de	lang2
preparación	lang2

@esangiecarajo	other
no	lang2
.	other
y	lang2
no	lang2

las	lang2
0	other
ganas	lang2
mia	lang2
de	lang2
no	lang2
hacer	lang2


In [4]:
if not '':
    print("TUGO")

TUGO


In [15]:
from datasets import Dataset
from pysentimiento.lince.ner import preprocess_token, tokenize_and_align_labels

original_words = [[x[0] for x in sentence] for sentence in test_data]
original_langs = [[x[1] for x in sentence] for sentence in test_data]
ner = [[None] * len(x) for x in test_data]

for w, l in zip(original_words, ner):
    assert len(w) == len(l)

test_dataset = Dataset.from_dict(
    {"words": original_words, "lang": original_langs, "labels": ner}
)


In [22]:
non_processed_words = test_dataset[479]["words"]
print(" ".join(non_processed_words))
[preprocess_token(word, "es", char_replace=False) for word in non_processed_words]

makeupbymariajose 's photo https://t.co/LvRc5JJxvJ denle follow para citas de maquillaje ☺ ️


['makeupbymariajose',
 "'s",
 'photo',
 'https://url',
 'denle',
 'follow',
 'para',
 'citas',
 'de',
 'maquillaje',
 '☺',
 '️']

In [26]:
non_processed_words[3] == "https://t.co/LvRc5JJxvJ"

True

In [28]:
preprocess_token(non_processed_words[3], "es", char_replace=False)

'https://url'

In [27]:
from pysentimiento.preprocessing import url_regex

url_regex.sub(non_processed_words[3], "url")

'url'

In [21]:
from pysentimiento.preprocessing import url_regex
tok = non_processed_words[-1]



'️'

In [None]:

test_dataset = test_dataset.map(
    lambda x: {
        "words": [preprocess_token(word, "es", char_replace=False) for word in x["words"]]
    }
)


In [14]:
test_dataset[479]

{'words': ['makeupbymariajose',
  "'s",
  'photo',
  'https://url',
  'denle',
  'follow',
  'para',
  'citas',
  'de',
  'maquillaje',
  '☺',
  '️'],
 'lang': ['ne',
  'lang1',
  'lang1',
  'other',
  'lang2',
  'lang1',
  'lang2',
  'lang2',
  'lang2',
  'lang2',
  'other',
  'other'],
 'labels': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None]}

In [12]:

tokenize_fun = lambda x: tokenize_and_align_labels(x, tokenizer)
test_dataset = test_dataset.map(
    tokenize_fun, batched=True, batch_size=32, 
)

test_dataset = test_dataset.remove_columns(["labels"])

  0%|          | 0/736 [00:00<?, ?ba/s]

In [7]:
from tqdm.auto import tqdm

for idx, row in tqdm(enumerate(test_dataset), total=len(test_dataset)):
    word_ids = row["word_ids"]
    input_ids = row["input_ids"]
    words = original_words[idx]
    assert (word_ids[-2] + 1) == len(words)

  0%|          | 0/23527 [00:00<?, ?it/s]

AssertionError: 

In [10]:
idx, row["words"]

(479,
 ['makeupbymariajose',
  "'s",
  'photo',
  'https://url',
  'denle',
  'follow',
  'para',
  'citas',
  'de',
  'maquillaje',
  '☺',
  '️'])

In [22]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
data_collator = DataCollatorForTokenClassification(tokenizer)

train_args = TrainingArguments(
    output_dir="./test/",
    per_device_eval_batch_size=32,
    do_train=False,
)

trainer_args = {
    "model": model,
    "args": train_args,
    "eval_dataset": test_dataset,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
}

eval_trainer = Trainer(**trainer_args)

In [23]:
ret = eval_trainer.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: word_ids, words, lang.
***** Running Prediction *****
  Num examples = 23527
  Batch size = 32


In [30]:
word_ids[-2], len(words)

(10, 12)

In [24]:
from tqdm.auto import tqdm
from pysentimiento.lince.ner import id2label, label2id

outputs = []

for idx, row in tqdm(enumerate(test_dataset), total=len(test_dataset)):
    word_ids = row["word_ids"]
    input_ids = row["input_ids"]
    words = original_words[idx]
    assert (word_ids[-2] + 1) == len(words)
    preds = ret.predictions[idx]

    sentence_output = []
    current_word_id = None

    for word_id, token_id, pred in zip(word_ids, input_ids, preds):
        token = tokenizer.decode(token_id)
        if current_word_id != word_id and word_id is not None:
            current_word_id = word_id
            label = id2label[pred.argmax()]
            sentence_output.append([words[current_word_id], label])
    outputs.append(sentence_output)


  0%|          | 0/23527 [00:00<?, ?it/s]

AssertionError: 

In [25]:
words, row["words"]

(['makeupbymariajose',
  "'s",
  'photo',
  'https://t.co/LvRc5JJxvJ',
  'denle',
  'follow',
  'para',
  'citas',
  'de',
  'maquillaje',
  '☺',
  '️'],
 ['makeupbymariajose',
  "'s",
  'photo',
  'https://url',
  'denle',
  'follow',
  'para',
  'citas',
  'de',
  'maquillaje',
  '☺',
  '️'])

In [32]:
def write_conll(path, data):
    with open(path, "w") as f:
        for sentence in data:
            for row in sentence:
                line = "\t".join(row)
                f.write(f"{line}\n")
            f.write("\n")

write_conll("test.conll", outputs)