In [1]:
import numpy as np
import datasets
from transformers import AutoTokenizer, AutoModel

In [2]:
dataset = datasets.load_dataset ("conll2003")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
train_dataset = dataset['train']
train_dataset[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
train_dataset.select_columns(['tokens','ner_tags'])[:3]

{'tokens': [['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  ['Peter', 'Blackburn'],
  ['BRUSSELS', '1996-08-22']],
 'ner_tags': [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2], [5, 0]]}

In [6]:
train_dataset.features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

Lets Tokenize the dataset

In [7]:
checkpoint  = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint+ r"\runs\Sep20_20-50-18_PKL-SAFAHM6Q-LT")
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased\runs\Sep20_20-50-18_PKL-SAFAHM6Q-LT', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [8]:
tokenizer.is_fast

True

In [9]:
train_dataset['tokens'][0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [10]:
token = tokenizer(train_dataset[0]['tokens'], is_split_into_words=True)
token

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
token.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

Tokenization breaked words to tokens. Like lamd into la nd ##md. this will disturb our NER and POS tages in dataset.
Lets solve this issue.

In [12]:
tok = train_dataset[0]['tokens']
ner = train_dataset[0]['ner_tags']


In [13]:
tok , ner

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

In [14]:
token = tokenizer(tok, is_split_into_words=True)
token

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
token.tokens(), token.word_ids()

(['[CLS]',
  'EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'la',
  '##mb',
  '.',
  '[SEP]'],
 [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None])

In [16]:
len ( token.tokens() )

12

In [17]:
token.tokens()[0]

'[CLS]'

In [18]:
text = "Apple Inc. is a technology company."
ner_labels = ["B-ORG", "I-ORG", "O", "O", "O", "O", "O"]

# Tokenize the text

tokenss = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))
print (tokenizer.encode(text))
print (tokenizer.decode(tokenizer.encode(text)))
print (tokenizer.tokenize(text))
print (tokenizer(text))
print (tokenss)


tokenizer.convert_ids_to_tokens( tokenizer(text)['input_ids'] )

[101, 7302, 3561, 119, 1110, 170, 2815, 1419, 119, 102]
[CLS] Apple Inc. is a technology company. [SEP]
['Apple', 'Inc', '.', 'is', 'a', 'technology', 'company', '.']
{'input_ids': [101, 7302, 3561, 119, 1110, 170, 2815, 1419, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'Apple', 'Inc', '.', 'is', 'a', 'technology', 'company', '.', '[SEP]']


['[CLS]',
 'Apple',
 'Inc',
 '.',
 'is',
 'a',
 'technology',
 'company',
 '.',
 '[SEP]']

In [19]:
orig_tok = tok
new_tok  = token.tokens()
ner_new = []
word_count = 0
for count in range(len(new_tok)):
    if new_tok[count] in ['[CLS]','[SEP]']:
        ner_new.append(-100)
    elif new_tok[count]:
        pass
    else:
        ner_new.append(0)

print(ner_new)

"""
I have tried to make a logic to compare the string to modeify NER id based on tokenization string but it failed.
There are
"""

[-100, -100]


'\nI have tried to make a logic to compare the string to modeify NER id based on tokenization string but it failed.\nThere are\n'

In [20]:
def tokenization_NER(data):
    #data = train_dataset
    data_tok = tokenizer (data['tokens'], is_split_into_words=True)
    #print ("Token Data: " ,data_tok,"\nner tags: ",data['ner_tags'],"\nword_id: ", data_tok.word_ids())
    #print (train_dataset.features["ner_tags"])
    tok_word_id = data_tok.word_ids()
    NER_old_id = data['ner_tags']
    NER_new_id = []
    temp = None

    for id in tok_word_id:
        if id == None:
            NER_new_id.append(-100)
        elif id == temp:
            #print ("-------",data["ner_tags"][id] ," == ", id)
            if data["ner_tags"][id] % 2 == 1 :
                temp = id
                NER_new_id.append(data["ner_tags"][id] + 1)
            else:
                tag = data["ner_tags"][id]
                tem = -100 if tag == None else tag
                NER_new_id.append (tem)
        else:
            temp = id
            #print ("++++",data["ner_tags"][id] ," == ", id)
            NER_new_id.append(data['ner_tags'][id])

    data_tok["labels"] = NER_new_id
    return data_tok





In [21]:
tokenization_NER(train_dataset[0])

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

In [22]:
x_dataset =  train_dataset.map(tokenization_NER, remove_columns=train_dataset.column_names)

In [23]:
x_dataset[:5]

{'input_ids': [[101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  [101, 1943, 14428, 102],
  [101, 26660, 13329, 12649, 15928, 1820, 118, 4775, 118, 1659, 102],
  [101,
   1109,
   1735,
   2827,
   1163,
   1113,
   9170,
   1122,
   19786,
   1114,
   1528,
   5566,
   1106,
   11060,
   1106,
   188,
   17315,
   1418,
   2495,
   12913,
   1235,
   6479,
   4959,
   2480,
   6340,
   13991,
   3653,
   1169,
   1129,
   12086,
   1106,
   8892,
   119,
   102],
  [101,
   1860,
   112,
   188,
   4702,
   1106,
   1103,
   1735,
   1913,
   112,
   188,
   27431,
   3914,
   14651,
   163,
   7635,
   4119,
   1163,
   1113,
   9031,
   11060,
   1431,
   4417,
   8892,
   3263,
   2980,
   1121,
   2182,
   1168,
   1190,
   2855,
   1235,
   1103,
   3812,
   5566,
   1108,
   27830,
   119,
   102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0,
   0,


How hugging face does it. but they have parced my single funtion into 3 sub function

In [24]:

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            print (label)
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [25]:
labels = train_dataset[0]
temp_token = tokenizer(labels['tokens'], is_split_into_words=True)
word_ids = token.word_ids()
print(train_dataset.features['ner_tags'],"\n word_ids", word_ids)
print(align_labels_with_tokens(labels['ner_tags'], word_ids))

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None) 
 word_ids [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
0
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [26]:
token_train_ds =  train_dataset.map(tokenization_NER)
token_train_ds 

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 14041
})

In [27]:
token_train_ds['ner_tags'][:5]

[[3, 0, 7, 0, 0, 0, 7, 0, 0],
 [1, 2],
 [5, 0],
 [0,
  3,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [5,
  0,
  0,
  0,
  0,
  3,
  4,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

Using Tansformering fuction data colleciton for token classification to tokenize, padding and arraning the NER labels 

In [28]:
from transformers import DataCollatorForTokenClassification, AutoTokenizer, AutoModel

checkpoint  = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

datacollector = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [29]:
x_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 14041
})

In [30]:
temp = [x_dataset[i] for i in range(5) ]
temp

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'token_type_ids': [0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]},
 {'input_ids': [101,
   26660,
   13329,
   12649,
   15928,
   1820,
   118,
   4775,
   118,
   1659,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 5, 6, 6, 6, 0, 0, 0, 0, 0, -100]},
 {'input_ids': [101,
   1109,
   1735,
   2827,
   1163,
   1113,
   9170,
   1122,
   19786,
   1114,
   1528,
   5566,
   1106,
   11060,
   1106,
   188,
   17315,
   1418,
   2495,
   12913,
   1235,
   6479,
   4959,
   2480,
   6340,
   13991,
   3653,
   1169,
   1129,
   12086,


In [31]:
tensor_ds = datacollector(x_dataset)
tensor_ds

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7270, 22961,  ...,     0,     0,     0],
        [  101,  1943, 14428,  ...,     0,     0,     0],
        [  101, 26660, 13329,  ...,     0,     0,     0],
        ...,
        [  101, 10033,   123,  ...,     0,     0,     0],
        [  101,  1784,  1210,  ...,     0,     0,     0],
        [  101, 17057,   122,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,  ..., -100, -100, -100],
        [-100,    1,    2,  ..., -100, -100, -100],
        [-100,    5,    6,  ..., -1

Create the evaluation function

In [32]:
import evaluate

metric = evaluate.load("seqeval")

In [33]:
NER_label = token_train_ds.features['ner_tags'].feature.names 
NER_label

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [34]:
labels = token_train_ds[0]['ner_tags']
true_label = [NER_label[i] for i in labels] 
true_label

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [35]:
test_label = true_label.copy()
test_label[2] = '0'
test_label, true_label

(['B-ORG', 'O', '0', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])

In [36]:
metric.compute(predictions= [test_label], references= [true_label] )



{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

Lets see what our model output produce

In [37]:
from transformers import AutoModel

In [38]:
model = AutoModel.from_pretrained("Bert-cased-NER")
model

Some weights of BertModel were not initialized from the model checkpoint at Bert-cased-NER and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [39]:
tensor_ds.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [40]:
temp_tensor = tensor_ds.copy()
temp_tensor.pop("labels")
temp_tensor.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

model (temp_tensor)

In [41]:
def eval_fun (eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    true_label = [label for label in labels if label != -100]
    

    return 0

In [42]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[ner_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [ner_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

Working on Model

In [43]:
NER_label

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [44]:
id2label = {i:labels for i, labels in enumerate(NER_label)}
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [45]:
label2id = {v: k for k, v in enumerate(NER_label)}
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [46]:
from transformers import AutoModelForTokenClassification

checkpoint = 'bert-base-cased'

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label= id2label,
    label2id = label2id

)
model

OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory bert-base-cased.

In [None]:
dataset = dataset.map(tokenization_NER, remove_columns=dataset["train"].column_names)
dataset

In [None]:
final_ds = dataset
final_ds = final_ds.remove_columns(['labels'])

final_ds

In [None]:
dataset,final_ds

In [None]:
from transformers import TrainingArguments, Trainer, AutoModel

checkpoint = 'bert-base-cased'

In [None]:
# Tryin to resolve model issue
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-cased')
unmasker("Hello I'm a [MASK] model.")

In [None]:
model = AutoModel.from_pretrained(checkpoint)

In [None]:
training_args = TrainingArguments(
    checkpoint,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01

)

In [None]:
trainer = Trainer(
    model=model,
    args=  training_args,
    data_collator= datacollector,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics

)

trainer.train()

In [None]:
trainer.save_model('bert-cased-ner-sam')