# First Sub-token Labeling:

    Assign the slot label of the original word only to the first sub-token. The subsequent sub-tokens can be assigned a special "X" label or ignored during the evaluation and loss calculation.
    Example (illustrative split; the actual sub-tokens depend on the tokenizer's vocabulary and casing):
        Original Sentence: "I want to book an airplane ticket"
        Tokenized: ["I", "want", "to", "book", "an", "air", "##plane", "ticket"]
        Slot Labels: ["O", "O", "O", "O", "O", "B-transport", "X", "O"]

# Replicating Slot Labels:

    Assign the slot label of the original word to all its sub-tokens.
    Example:
        Original Sentence: "I want to book an airplane ticket"
        Tokenized: ["I", "want", "to", "book", "an", "air", "##plane", "ticket"]
        Slot Labels: ["O", "O", "O", "O", "O", "B-transport", "B-transport", "O"]

In [3]:
from transformers import BertTokenizer

# Notebook-wide tokenizer loaded from the pretrained 'bert-base-uncased' checkpoint.
# The cells below use it to split words into sub-tokens and to map tokens to ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_preserve_labels(sentence, text_labels, *, tokenize=None, subword_label="X"):
    """Split a sentence into sub-tokens and expand word-level labels to match.

    Implements "first sub-token labeling": the first sub-token of every word
    keeps the word's label and every following sub-token of that word gets
    ``subword_label``.

    Args:
        sentence: Raw sentence; words are taken from ``sentence.split()``.
        text_labels: One label per whitespace-separated word, in order.
        tokenize: Optional callable mapping one word to a list of sub-tokens.
            Defaults to the module-level ``tokenizer.tokenize``.
        subword_label: Label given to the non-first sub-tokens of a word.

    Returns:
        A ``(tokens, labels)`` tuple of two lists with equal length.

    Raises:
        ValueError: If the number of labels differs from the number of words.
    """
    words = sentence.split()
    word_labels = list(text_labels)
    if len(words) != len(word_labels):
        # zip() would silently drop the surplus words or labels.
        raise ValueError(
            f"got {len(words)} words but {len(word_labels)} labels; "
            "expected exactly one label per whitespace-separated word"
        )

    if tokenize is None:
        # Resolved at call time so the notebook-level tokenizer is used by default.
        tokenize = tokenizer.tokenize

    tokenized_sentence = []
    labels = []
    for word, label in zip(words, word_labels):
        subwords = tokenize(word)
        if not subwords:
            # The tokenizer produced nothing for this word; emitting its label
            # anyway would shift every following label by one position.
            continue

        tokenized_sentence.extend(subwords)
        labels.append(label)
        labels.extend([subword_label] * (len(subwords) - 1))

    return tokenized_sentence, labels

# Demo input: one label per whitespace-separated word (8 words, 8 labels).
# NOTE(review): "pizza" does not follow the O / B-... pattern of the other labels;
# presumably it is a placeholder that makes the label of the last word easy to
# spot in the printed output — confirm before reusing this as real data.
sentence = "I want to book an airplane ticket airfare"
labels = ["O", "O", "O", "O", "O", "B-transport", "O", "pizza"]
# Unfinished alternative example (its label list was never written):
# sentence = "what is the cost for these flights from baltimore to philadelphia"
# label


# Split every word into sub-tokens and expand the word-level labels to match.
tokenized_sentence, updated_labels = tokenize_and_preserve_labels(sentence, labels)

# Map each sub-token to its vocabulary id.
input_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

# Show the three sequences for inspection.
print("Tokenized Sentence:", tokenized_sentence)
print("Updated Labels:", updated_labels)
print("Input IDs:", input_ids)

Tokenized Sentence: ['i', 'want', 'to', 'book', 'an', 'airplane', 'ticket', 'air', '##fare']
Updated Labels: ['O', 'O', 'O', 'O', 'O', 'B-transport', 'O', 'pizza', 'X']
Input IDs: [1045, 2215, 2000, 2338, 2019, 13297, 7281, 2250, 17883]


In [5]:
# Encode the already-split token list with special tokens disabled. The recorded
# result below has the same input_ids as convert_tokens_to_ids produced above,
# plus token_type_ids and attention_mask.
tokenizer.encode_plus(['i', 'want', 'to', 'book', 'an', 'airplane', 'ticket', 'air', '##fare'], add_special_tokens=False)

{'input_ids': [1045, 2215, 2000, 2338, 2019, 13297, 7281, 2250, 17883], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [34]:
# Decode the same ids wrapped with 101 at the start and 102, 0 at the end; per the
# recorded output these three ids render as [CLS], [SEP] and [PAD], and the
# 'air' + '##fare' pieces are merged back into "airfare".
tokenizer.decode([101, 1045, 2215, 2000, 2338, 2019, 13297, 7281, 2250, 17883, 102, 0])

'[CLS] i want to book an airplane ticket airfare [SEP] [PAD]'