In [64]:
%load_ext autoreload
%autoreload 2

# Instantiate spaCy's Spanish language class and keep a handle on its tokenizer,
# which is used below to split raw sentences into words.
from spacy.lang.es import Spanish

nlp = Spanish()
spacy_tokenizer = nlp.tokenizer

# Sanity check: display the tokens produced for a sample sentence.
[token for token in spacy_tokenizer("Venga que esto es una risa tío")]

[Venga, que, esto, es, una, risa, tío]

In [65]:
# Load the tokenizer and the token-classification model for the NER checkpoint.
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "pysentimiento/robertuito-ner"

# Both objects are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [66]:
# Example sentences to run through the NER model.
sentences = [
    "abran paso al mejor de todos los tiempos, Leonel Andrés messi cuccittini",
    "sos el mejor leo",
]

# Pre-split every sentence with the spaCy tokenizer and keep only the token
# texts: one list of word strings per sentence.
words = [[tok.text for tok in doc] for doc in map(spacy_tokenizer, sentences)]
    



In [67]:
import torch

# Encode the pre-split words. `is_split_into_words=True` tells the tokenizer the
# input is already a list of words per sentence; `padding=True` pads the batch
# so every row has the same length and can be stacked into a tensor.
inputs = tokenizer(words, is_split_into_words=True, padding=True)

# Convert each encoded field (lists of ints) into a tensor for the model.
# `inputs` itself is kept as-is because later cells call `inputs.word_ids(i)`.
model_inputs = {k: torch.tensor(v) for k, v in inputs.items()}

# Inference only: disable autograd so no computation graph is built and the
# returned logits do not carry gradient history.
with torch.no_grad():
    outs = model(**model_inputs)

In [68]:
# Mapping from predicted label id to tag name, as stored in the model config.
id2label = model.config.id2label

# Pick the highest-scoring label id at every position (argmax over axis 2 of
# the logits).
outputs = outs.logits.argmax(dim=2)



In [69]:
# Display the predicted label ids: one row per sentence, one entry per
# position of the padded token sequence.
outputs

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 11, 12, 12, 12, 12, 12, 12,  0],
        [ 0,  0,  0,  0, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [70]:
# Collapse the per-position predictions into one label per spaCy word.
# After the loop, `labels[i][j]` is the tag name assigned to `words[i][j]`.
labels = []

for i, (sentence, output) in enumerate(zip(words, outputs)):

    # One slot per word; None means "no label assigned yet".
    sentence_labels = [None for _ in sentence]
    print(sentence)
    print(output)
    # word_ids(i) gives, for each position of row i, the index of the word it
    # belongs to, or None for positions that map to no word (in the recorded
    # output: the first/last positions and the padding).
    word_ids = inputs.word_ids(i)
    print(word_ids)

    for word_id, label in zip(word_ids, output):
        # A word split into several sub-tokens appears several times in
        # word_ids; only the prediction at its first position is kept.
        if word_id is not None and sentence_labels[word_id] is None:
            sentence_labels[word_id] = id2label[label.item()]

    # Store the result so it remains available after the loop.
    labels.append(sentence_labels)
    print(list(zip(sentence, sentence_labels)))

['abran', 'paso', 'al', 'mejor', 'de', 'todos', 'los', 'tiempos', ',', 'Leonel', 'Andrés', 'messi', 'cuccittini']
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 11, 12, 12, 12, 12, 12, 12,  0])
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, None]
[('abran', 'O'), ('paso', 'O'), ('al', 'O'), ('mejor', 'O'), ('de', 'O'), ('todos', 'O'), ('los', 'O'), ('tiempos', 'O'), (',', 'O'), ('Leonel', 'B-PER'), ('Andrés', 'I-PER'), ('messi', 'I-PER'), ('cuccittini', 'I-PER')]
['sos', 'el', 'mejor', 'leo']
tensor([ 0,  0,  0,  0, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
[None, 0, 1, 2, 3, None, None, None, None, None, None, None, None, None, None, None, None, None]
[('sos', 'O'), ('el', 'O'), ('mejor', 'O'), ('leo', 'B-PER')]


## Analyzer

In [74]:
# Build the Spanish NER analyzer through pysentimiento's factory function.
# NOTE(review): the recorded output of this cell shows an ipdb prompt stopped
# inside create_analyzer (analyzer.py, `ipdb.set_trace()` at line 400). That
# breakpoint lives in the local pysentimiento checkout and will block any
# non-interactive run — remove it from the library before re-running.
from pysentimiento import create_analyzer
ner_analyzer = create_analyzer("ner", lang="es")

> [0;32m/home/jmperez/projects/pysentimiento/pysentimiento/analyzer.py[0m(401)[0;36mcreate_analyzer[0;34m()[0m
[0;32m    399 [0;31m    [0;32mimport[0m [0mipdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    400 [0;31m    [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 401 [0;31m    [0;32mreturn[0m [0manalyzer_class[0m[0;34m.[0m[0mfrom_model_name[0m[0;34m([0m[0mmodel_name[0m[0;34m,[0m [0mtask[0m[0;34m,[0m [0mpreprocessing_args[0m[0;34m,[0m [0mlang[0m[0;34m=[0m[0mlang[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


loading configuration file config.json from cache at /home/jmperez/.cache/huggingface/hub/models--pysentimiento--robertuito-ner/snapshots/c5c1a4673c8e833e9a66b5bf2942988e65349538/config.json
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-ner",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-EVENT",
    "2": "I-EVENT",
    "3": "B-GROUP",
    "4": "I-GROUP",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-ORG",
    "8": "I-ORG",
    "9": "B-OTHER",
    "10": "I-OTHER",
    "11": "B-PER",
    "12": "I-PER",
    "13": "B-PROD",
    "14": "I-PROD",
    "15": "B-TIME",
    "16": "I-TIME",
    "17": "B-TITLE",
    "18": "I-TITLE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,


In [86]:
# Run the analyzer on the raw sentences. In the recorded output the result is
# one list per sentence, each holding entity dicts with 'type', 'text',
# 'start' and 'end' keys.
ner_analyzer.predict(sentences)

> [0;32m/home/jmperez/projects/pysentimiento/pysentimiento/analyzer.py[0m(303)[0;36mdecode[0;34m()[0m
[0;32m    302 [0;31m[0;34m[0m[0m
[0m[0;32m--> 303 [0;31m        [0;32mfor[0m [0msegment[0m [0;32min[0m [0mentities[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    304 [0;31m            segment["text"] = "".join(
[0m
> [0;32m/home/jmperez/projects/pysentimiento/pysentimiento/analyzer.py[0m(303)[0;36mdecode[0;34m()[0m
[0;32m    302 [0;31m[0;34m[0m[0m
[0m[0;32m--> 303 [0;31m        [0;32mfor[0m [0msegment[0m [0;32min[0m [0mentities[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    304 [0;31m            segment["text"] = "".join(
[0m


[[{'type': 'PER',
   'text': 'Leonel Andrés messi cuccittini',
   'start': 42,
   'end': 72}],
 [{'type': 'PER', 'text': 'leo', 'start': 13, 'end': 16}]]