## Inference with the NER model

In [1]:
from transformers import AutoModelForTokenClassification, AutoTokenizer


# Placeholder value; model_path is reassigned to the actual model directory in a later cell.
model_path = ".."

!ls

Inference.ipynb  ner  ner_spa_eng.txt  pos  sentiment  submissions


In [4]:
!ls ../../models/robertuito-lince-ner-uncased

config.json		 test_results.json	training_args.bin
pytorch_model.bin	 tokenizer.json
special_tokens_map.json  tokenizer_config.json


In [5]:
# Local directory with the model files (config, weights, tokenizer files)
# listed by the previous cell.
model_path = "../../models/robertuito-lince-ner-uncased"

# Load the token-classification model and its matching tokenizer from that directory.
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [28]:
from transformers import pipeline

# Ready-made Hugging Face NER pipeline built from the same model and tokenizer.
# NOTE(review): aggregation_strategy="first" presumably tags each word with the
# label of its first sub-token -- confirm against the transformers docs.
# `pipe` is not used by any of the cells visible below.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")

In [44]:
import torch

def label_words(text, model, tokenizer):
    """
    Label every word of a text with a token-classification model

    Each word receives the label predicted for its first sub-token; the
    predictions of the remaining sub-tokens and of special tokens are ignored.

    Arguments:
    ----------

    text: str or list of str
        Text to be labeled. A list (or tuple) is treated as one sentence that
        is already split into words, one word per element
    model: AutoModelForTokenClassification
        Labeling model
    tokenizer: AutoTokenizer
        Tokenizer. The encoding it returns must provide `word_ids()` and
        `word_to_chars()`

    Returns:
    --------

    word_and_labels: list of tuple of (str, str)
        The words, in order of appearance, paired with their labels.
        Label names are taken from `model.config.id2label`
        (e.g. "O", "B-PER", "I-PER", "B-LOC", "B-ORG", ...)
    """
    is_split_into_words = isinstance(text, (list, tuple))

    # Without `is_split_into_words` a list would be tokenized as a batch of
    # independent sentences instead of as the words of a single sentence.
    # `truncation=True` means that words beyond the model's maximum length
    # are dropped from the result.
    inputs = tokenizer(
        text, return_tensors="pt",
        truncation=True,
        is_split_into_words=is_split_into_words,
    )

    word_ids = inputs.word_ids()

    # Inference only: there is no need to build the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2).view(-1).tolist()

    id2label = model.config.id2label

    word_and_labels = []
    previous_word_id = None
    for word_id, label_id in zip(word_ids, predictions):
        # A word starts at the first token that maps to a new word id;
        # special tokens map to None and never start a word.
        starts_new_word = word_id is not None and word_id != previous_word_id
        previous_word_id = word_id
        if not starts_new_word:
            continue

        if is_split_into_words:
            word = text[word_id]
        else:
            start, end = inputs.word_to_chars(word_id)
            word = text[start:end]

        # Emitting the word as soon as it starts means the last word does not
        # depend on a trailing special token to be flushed.
        word_and_labels.append((word.strip(), id2label[label_id]))

    return word_and_labels


In [45]:
# Sentence without named entities: every word is tagged "O" in the recorded output.
label_words("El presidente de la República es una persona", model, tokenizer)

[('El', 'O'),
 ('presidente', 'O'),
 ('de', 'O'),
 ('la', 'O'),
 ('República', 'O'),
 ('es', 'O'),
 ('una', 'O'),
 ('persona', 'O')]

In [46]:
# Sentence with a two-word person name and an organization (PER and ORG tags in the recorded output).
label_words("El presidente de la República es John Wayne y es dueño de GreenTugo", model, tokenizer)

[('El', 'O'),
 ('presidente', 'O'),
 ('de', 'O'),
 ('la', 'O'),
 ('República', 'O'),
 ('es', 'O'),
 ('John', 'B-PER'),
 ('Wayne', 'I-PER'),
 ('y', 'O'),
 ('es', 'O'),
 ('dueño', 'O'),
 ('de', 'O'),
 ('GreenTugo', 'B-ORG')]

In [57]:
# Short sentence whose last word is tagged as a location (B-LOC) in the recorded output.
label_words("Esto es Tugolandia", model, tokenizer)

[('Esto', 'O'), ('es', 'O'), ('Tugolandia', 'B-LOC')]

In [70]:
def bio_to_segments(word_and_labels):
    """
    Convert BIO labels to segments

    Consecutive words labeled "B-X", "I-X", "I-X", ... are grouped into one
    segment. An "O" label or a new "B-" label closes the open segment.
    An "I-" label with no open segment starts a new one, as if it were "B-".
    The type of an "I-" label is not compared with the type of the open
    segment: the word joins the segment and the segment keeps the type it was
    opened with. Labels that are neither "O" nor start with "B-"/"I-" are
    skipped without closing the open segment.

    Arguments:
    ----------

    word_and_labels: list of tuple of (str, str)
        The word and label pairs.

    Returns:
    --------

    segments: list of dicts
        The segments, in order of appearance. Each one has the keys
        "tokens" (list of words), "type" (label without the "B-"/"I-" prefix)
        and "text" (the words joined by a single space).
    """
    def _make_segment(words, segment_type):
        # Build the dict for a finished segment
        return {
            "tokens": words,
            "type": segment_type,
            "text": " ".join(words),
        }

    ret = []
    current_words = []
    current_type = None

    for word, label in word_and_labels:
        if label.startswith('I-') and current_words:
            # Continue the open segment (its type is deliberately not checked)
            current_words.append(word)
        elif label.startswith(('B-', 'I-')):
            # "B-" always opens a segment; an orphan "I-" does too. The word
            # is added exactly once (it used to be duplicated for orphan "I-").
            if current_words:
                ret.append(_make_segment(current_words, current_type))
            current_words = [word]
            current_type = label[2:]
        elif label == 'O':
            if current_words:
                ret.append(_make_segment(current_words, current_type))
            current_words = []
            current_type = None

    # Close a segment that runs until the end of the input
    if current_words:
        ret.append(_make_segment(current_words, current_type))

    return ret

def detect_entities(text):
    """
    Find the entity segments mentioned in a text

    The text is labeled word by word with the module-level `model` and
    `tokenizer`, and the resulting BIO labels are grouped into segments.

    Arguments:
    ----------

    text: str
        Text to be labeled

    Returns:
    --------

    segments: list of dicts
        The segments, as returned by `bio_to_segments`.
    """
    return bio_to_segments(label_words(text, model, tokenizer))

In [71]:
# End-to-end check on a sentence mixing English and Spanish. In the recorded
# output the trailing comma stays attached to the location ('Uruguay,').
detect_entities("My name is John Wayne y soy el presidente de Uruguay, pedazo de cabrón")

[{'tokens': ['John', 'Wayne'], 'type': 'PER', 'text': 'John Wayne'},
 {'tokens': ['Uruguay,'], 'type': 'LOC', 'text': 'Uruguay,'}]

In [76]:
# Multi-word location ('25 de Mayo'): tagged B-LOC, I-LOC, I-LOC in the recorded output.
label_words("Me llamo Juan Pablo, me gusta ir a 25 de Mayo", model, tokenizer)

[('Me', 'O'),
 ('llamo', 'O'),
 ('Juan', 'B-PER'),
 ('Pablo,', 'I-PER'),
 ('me', 'O'),
 ('gusta', 'O'),
 ('ir', 'O'),
 ('a', 'O'),
 ('25', 'B-LOC'),
 ('de', 'I-LOC'),
 ('Mayo', 'I-LOC')]