In [14]:
import re
def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

In [15]:
print(get_tokens_with_entities("I come from [Warsaw Natolin,](location) [Poland](location)"))


[('I', 'O'), ('come', 'O'), ('from', 'O'), ('Warsaw', 'B-location'), ('Natolin,', 'I-location'), ('Poland', 'B-location')]


In [16]:
print(get_tokens_with_entities("[FacoryBuyes](brand) [32 cm](Matras_Width) "))


[('FacoryBuyes', 'B-brand'), ('32', 'B-Matras_Width'), ('cm', 'I-Matras_Width'), ('', 'O')]


In [17]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# note that I purposefully misspell Kathmandu to Kathamanduu
sample_input = "I come from [Warsaw Natolin,](location) [Poland](location)"
tokens, entities = list(zip(*get_tokens_with_entities(sample_input)))
tokenized_input = tokenizer(tokens, is_split_into_words=True)
print("Original tokens           : ", tokens)
print("After subword tokenization: ", tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']))

Original tokens           :  ('I', 'come', 'from', 'Warsaw', 'Natolin,', 'Poland')
After subword tokenization:  ['[CLS]', 'i', 'come', 'from', 'warsaw', 'nato', '##lin', ',', 'poland', '[SEP]']
