In [1]:
import numpy as np
import datasets
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load the dataset registered under the name "conll2003"; the result is bound to
# `dataset` and indexed by split name in the cells that follow.
dataset = datasets.load_dataset ("conll2003")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# Keep a handle on the training split and display its first example.
train_dataset = dataset['train']
train_dataset[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
train_dataset.select_columns(['tokens','ner_tags'])[:3]

{'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn'],
  ['BRUSSELS', '1996-08-22']],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2], [5, 0]]}

In [8]:
train_dataset.features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

Let's tokenize the dataset.

In [14]:
# Load the tokenizer belonging to the pretrained checkpoint named below.
# `tokenizer` is used as a module-level name by the later cells and functions.
checkpoint  = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [16]:
tokenizer.is_fast

True

In [20]:
train_dataset['tokens'][0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [26]:
# The example is already a list of words, hence is_split_into_words=True.
token = tokenizer(train_dataset[0]['tokens'], is_split_into_words=True)
token

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [34]:
token.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

Tokenization broke some words into sub-word tokens, e.g. `lamb` into `la` and `##mb`. As a result, the word-level NER and POS tags in the dataset no longer line up one-to-one with the tokens.
Let's solve this issue.

In [69]:
# Words and word-level NER tag ids of the first training example.
tok = train_dataset[0]['tokens']
ner = train_dataset[0]['ner_tags']


In [70]:
tok , ner

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

In [139]:
# Tokenize the word list held in `tok`; this rebinds `token`, which the
# following cells read through token.tokens() and token.word_ids().
token = tokenizer(tok, is_split_into_words=True)
token

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [72]:
token.tokens(), token.word_ids()

(['[CLS]',
  'EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'la',
  '##mb',
  '.',
  '[SEP]'],
 [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None])

In [141]:
len ( token.tokens() )

12

In [140]:
token.tokens()[0]

'[CLS]'

In [100]:
# Compare the different ways the tokenizer can be invoked on a raw string.
text = "Apple Inc. is a technology company."
# NOTE(review): unused in this cell; it has 7 entries while the tokenizer
# produces 8 word pieces for `text` — confirm the intended alignment.
ner_labels = ["B-ORG", "I-ORG", "O", "O", "O", "O", "O"]

# Encode once and reuse the intermediate results instead of recomputing them.
encoded_ids = tokenizer.encode(text)
round_trip_text = tokenizer.decode(encoded_ids)
tokenss = tokenizer.tokenize(round_trip_text)
full_encoding = tokenizer(text)

print(encoded_ids)
print(round_trip_text)
print(tokenizer.tokenize(text))
print(full_encoding)
print(tokenss)

# Cell value: the token strings behind the encoded ids (special tokens included).
tokenizer.convert_ids_to_tokens(full_encoding['input_ids'])

[101, 7302, 3561, 119, 1110, 170, 2815, 1419, 119, 102]
[CLS] Apple Inc. is a technology company. [SEP]
['Apple', 'Inc', '.', 'is', 'a', 'technology', 'company', '.']
{'input_ids': [101, 7302, 3561, 119, 1110, 170, 2815, 1419, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'Apple', 'Inc', '.', 'is', 'a', 'technology', 'company', '.', '[SEP]']


['[CLS]',
 'Apple',
 'Inc',
 '.',
 'is',
 'a',
 'technology',
 'company',
 '.',
 '[SEP]']

In [83]:
# First (unsuccessful) attempt at re-labelling the word pieces by looking at the
# token strings themselves.
orig_tok = tok
new_tok = token.tokens()
word_count = 0
ner_new = []
for count, piece in enumerate(new_tok):
    if piece in ('[CLS]', '[SEP]'):
        # Special tokens carry no entity label.
        ner_new.append(-100)
    elif not piece:
        # Only an empty token string would ever be labelled 0.
        ner_new.append(0)
    # Every ordinary word piece is a non-empty string and gets no label here,
    # so only the special tokens end up in `ner_new`.

print(ner_new)

"""
I have tried to make a logic to compare the string to modeify NER id based on tokenization string but it failed.
There are
"""

[-100, -100]


In [188]:
def tokenization_NER(data):
    """Tokenize one pre-split example and align its NER tags with the word pieces.

    Parameters
    ----------
    data : mapping
        A single example with a ``'tokens'`` list of words and a ``'ner_tags'``
        list holding one integer tag id per word.

    Returns
    -------
    The tokenizer output for ``data['tokens']`` with an additional
    ``'ner_tags'`` entry holding one label per produced token:

    * ``-100`` for tokens that belong to no word (``word_ids()`` yields
      ``None``, e.g. ``[CLS]`` / ``[SEP]``); ``-100`` is the default
      ``ignore_index`` of PyTorch's cross-entropy loss,
    * the word's own tag for the first piece of every word,
    * for further pieces of the same word, the word's tag, increased by one
      when it is odd.

    Notes
    -----
    Uses the module-level ``tokenizer``. The "odd tag + 1" step assumes a label
    list in which every ``B-XXX`` id is odd and the matching ``I-XXX`` id is the
    next integer, as in the ``ner_tags`` feature printed earlier in this
    notebook (``O``, ``B-PER``, ``I-PER``, ``B-ORG``, ``I-ORG``, ...).
    """
    data_tok = tokenizer(data['tokens'], is_split_into_words=True)
    word_tags = data['ner_tags']

    aligned_tags = []
    previous_word = None
    for word_id in data_tok.word_ids():
        if word_id is None:
            # Special token: no word behind it, so it must not contribute to the loss.
            aligned_tags.append(-100)
        elif word_id == previous_word:
            # Continuation piece of the current word: B-XXX becomes I-XXX,
            # everything else (O or I-XXX) is repeated unchanged.
            tag = word_tags[word_id]
            aligned_tags.append(tag + 1 if tag % 2 == 1 else tag)
        else:
            # First piece of a new word keeps the word-level tag.
            previous_word = word_id
            aligned_tags.append(word_tags[word_id])

    data_tok["ner_tags"] = aligned_tags
    return data_tok





In [187]:
tokenization_NER(train_dataset[0])

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'ner_tags': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}


In [189]:
# Run tokenization_NER over the training split. The function returns a
# 'ner_tags' key, so the aligned labels are stored under that name.
# NOTE(review): the cell output below still shows 'id', 'tokens' and 'pos_tags',
# i.e. the original columns are kept — confirm that is intended.
x_dataset =  train_dataset.map(tokenization_NER)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

In [190]:
x_dataset[:5]

{'id': ['0', '1', '2', '3', '4'],
 'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn'],
  ['BRUSSELS', '1996-08-22'],
  ['The',
   'European',
   'Commission',
   'said',
   'on',
   'Thursday',
   'it',
   'disagreed',
   'with',
   'German',
   'advice',
   'to',
   'consumers',
   'to',
   'shun',
   'British',
   'lamb',
   'until',
   'scientists',
   'determine',
   'whether',
   'mad',
   'cow',
   'disease',
   'can',
   'be',
   'transmitted',
   'to',
   'sheep',
   '.'],
  ['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',
   'Wednesday',
   'consumers',
   'should',
   'buy',
   'sheepmeat',
   'from',
   'countries',
   'other',
   'than',
   'Britain',
   'until',
   'the',
   'scientific',
   'advice',
   'was',
   'clearer',
   '.']],
 'pos_tags': [[22, 42, 16

How Hugging Face does it — but they have split what I wrote as a single function into 3 sub-functions.

In [143]:

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Parameters
    ----------
    labels : sequence of int
        One tag id per word.
    word_ids : sequence of (int or None)
        For every token, the index of the word it came from, or ``None`` for
        tokens that belong to no word (special tokens).

    Returns
    -------
    list of int
        One label per entry of ``word_ids``: ``-100`` where the word id is
        ``None``, the word's label for the first token of a word, and for
        following tokens of the same word the word's label, increased by one
        when it is odd (B-XXX -> I-XXX under the label numbering used by the
        ``ner_tags`` feature of this notebook's dataset).
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word (or transition to a special token).
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token directly after another special token / at the start.
            new_labels.append(-100)
        else:
            # Same word as the previous token.
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX.
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [153]:
# Sanity-check align_labels_with_tokens on the first training example.
# Note: despite its name, `labels` holds the whole example dict.
labels = train_dataset[0]
temp_token = tokenizer(labels['tokens'], is_split_into_words=True)
# Read the word ids from the encoding built in this cell rather than from the
# `token` variable left over from an earlier cell, so the cell does not depend
# on hidden kernel state.
word_ids = temp_token.word_ids()
print(train_dataset.features['ner_tags'],"\n word_ids", word_ids)
print(align_labels_with_tokens(labels['ner_tags'], word_ids))

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None) 
 word_ids [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
0
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
