✏️ **Your turn!** Print the same two sentences with their POS or chunking labels.

(This exercise is from https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt)

In [1]:
!pip install -q datasets transformers[sentencepiece]

If running this notebook in Colab, please ensure that your Hugging Face `HF_TOKEN` is added to your Colab secrets.

Alternatively, please log in to Hugging Face by running the following cell.

In [2]:
# !huggingface-cli login

In [3]:
from datasets import load_dataset

# Load the dataset published under the name "conll2003". Displaying the returned
# DatasetDict lists each split together with its columns and row count.
raw_datasets = load_dataset("conll2003")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# Look at the first training example, one field per line.
example0 = raw_datasets['train'][0]
for field in example0:
    print(f"{field}: {example0[field]}")

id: 0
tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
pos_tags: [22, 42, 16, 21, 35, 37, 16, 21, 7]
chunk_tags: [11, 21, 11, 12, 21, 22, 11, 12, 0]
ner_tags: [3, 0, 7, 0, 0, 0, 7, 0, 0]


In [5]:
# Look at a longer training example (index 4), one field per line.
example4 = raw_datasets['train'][4]
for field in example4:
    print(f"{field}: {example4[field]}")

id: 4
tokens: ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
pos_tags: [22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]
chunk_tags: [11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1, 13, 11, 17, 11, 12, 12, 21, 1, 0]
ner_tags: [5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]


In [6]:
# The features mapping describes every column's type; for the tag columns it
# holds the ClassLabel names needed to turn integer ids back into strings.
features = raw_datasets['train'].features
features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [7]:
# Pick out the schema of the chunking column: a Sequence wrapping a ClassLabel.
chunk_feature = features['chunk_tags']
chunk_feature

Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None)

In [8]:
# The inner ClassLabel holds the label strings; a tag's integer id is its
# position in this list (e.g. 11 -> 'B-NP').
label_names = chunk_feature.feature.names
label_names

['O',
 'B-ADJP',
 'I-ADJP',
 'B-ADVP',
 'I-ADVP',
 'B-CONJP',
 'I-CONJP',
 'B-INTJ',
 'I-INTJ',
 'B-LST',
 'I-LST',
 'B-NP',
 'I-NP',
 'B-PP',
 'I-PP',
 'B-PRT',
 'I-PRT',
 'B-SBAR',
 'I-SBAR',
 'B-UCP',
 'I-UCP',
 'B-VP',
 'I-VP']

`'B-ADJP'` stands for "beginning of adjective phrase", `'I-ADJP'` stands for "inside adjective phrase", `'B-ADVP'` stands for "beginning of adverb phrase", `'I-ADVP'` stands for "inside adverb phrase", and so on for the other phrase types. `'O'` marks a token that is outside any chunk.

In [9]:
# Number of distinct chunk labels.
len(label_names)

23

In [10]:
# Print the first sentence with its chunk labels underneath. Each word/label
# pair is padded to a shared column width so the two lines stay aligned.
words = example0['tokens']
labels = example0['chunk_tags']
word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    # Widest of the pair, plus one space separating it from the next column.
    column_width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(column_width))
    label_cells.append(full_label.ljust(column_width))
line1 = "".join(word_cells)
line2 = "".join(label_cells)
print(line1)
print(line2)

EU   rejects German call to   boycott British lamb . 
B-NP B-VP    B-NP   I-NP B-VP I-VP    B-NP    I-NP O 


In [11]:
# Print the longer sentence (index 4) with its chunk labels underneath. Each
# word/label pair is padded to a shared column width so the lines stay aligned.
words = example4['tokens']
labels = example4['chunk_tags']
word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    # Widest of the pair, plus one space separating it from the next column.
    column_width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(column_width))
    label_cells.append(full_label.ljust(column_width))
line1 = "".join(word_cells)
line2 = "".join(label_cells)
print(line1)
print(line2)

Germany 's   representative to   the  European Union 's   veterinary committee Werner Zwingmann said on   Wednesday consumers should buy  sheepmeat from countries other  than Britain until  the  scientific advice was  clearer . 
B-NP    B-NP I-NP           B-PP B-NP I-NP     I-NP  B-NP I-NP       I-NP      I-NP   I-NP      B-VP B-PP B-NP      I-NP      B-VP   I-VP B-NP      B-PP B-NP      B-ADJP B-PP B-NP    B-SBAR B-NP I-NP       I-NP   B-VP B-ADJP  O 


✏️ **Your turn!** Some researchers prefer to attribute only one label per word, and assign `-100` to the other subtokens in a given word. This is to avoid long words that split into lots of subtokens contributing heavily to the loss. Change the previous function to align labels with input IDs by following this rule.

(This exercise is also from https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt)

In [12]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the "bert-base-cased" checkpoint, then check
# that it is a fast tokenizer (the later cells call `word_ids()` on its output)
# and what maximum sequence length it reports.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer.is_fast, tokenizer.model_max_length

(True, 512)

In [13]:
# Drop the columns that are not needed for NER, then rename the remaining two
# to `words` and `labels`.
raw_datasets = (
    raw_datasets
    .remove_columns(['id', 'pos_tags', 'chunk_tags'])
    .rename_column('tokens', 'words')
    .rename_column('ner_tags', 'labels')
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['words', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 3453
    })
})

In [14]:
# Slice out the first two training examples (a dict of column -> list of values)
# and show each column on its own line.
examples = raw_datasets['train'][:2]
for column in examples:
    print(f"{column}: {examples[column]}")

words: [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]
labels: [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]


In [15]:
# Tokenize the two examples. `is_split_into_words=True` tells the tokenizer the
# inputs are already lists of words rather than raw strings; sequences longer
# than 512 tokens are truncated.
encoding = tokenizer(examples['words'], is_split_into_words=True, truncation=True, max_length=512)
encoding

{'input_ids': [[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], [101, 1943, 14428, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]]}

In [16]:
# Compare word count with token count for the first example: the tokenizer adds
# special tokens and may split a word into several subtokens, so they differ.
len(examples['words'][0]), len(encoding['input_ids'][0])

(9, 12)

In [17]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to token level, labelling one token per word.

    Args:
        labels: Sequence of word-level label ids, indexable by word id.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for a special token.

    Returns:
        A list with one entry per token: the word's label for the first
        token of each word, and ``-100`` for special tokens and for every
        further subtoken of a word already labelled.
    """
    ignore_index = -100
    aligned = []
    previous_word_id = None
    for word_id in word_ids:
        # Only the first token of a real word carries that word's label.
        starts_new_word = word_id is not None and word_id != previous_word_id
        aligned.append(labels[word_id] if starts_new_word else ignore_index)
        previous_word_id = word_id
    return aligned

In [18]:
# Show the original words and word-level labels of the first example for reference.
print(examples['words'][0])
print(examples['labels'][0])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


In [19]:
# Test:
print(encoding.tokens(0))
print(align_labels_with_tokens(examples['labels'][0], encoding.word_ids(0)))

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[-100, 3, 0, 7, 0, 0, 0, 7, 0, -100, 0, -100]
