In [1]:
!pip install  evaluate
!pip install seqeval

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ | done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=0d728d0fd738a88d5aacb35912917f22410303ee5fc69bbd8ae7c1a8f16ceef9
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built se

In [2]:
import datasets 
import numpy as np 
import torch 
from evaluate import load
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from transformers import TrainingArguments, Trainer 

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [4]:
conll2003 = datasets.load_dataset("conll2003", trust_remote_code=True) 
conll2003

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [6]:
conll2003["train"].features["ner_tags"]


Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
ner_classes = conll2003["train"].features["ner_tags"].feature.names
ner_classes

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
example = conll2003["train"][345]
example['tokens'] ,[ner_classes[i] for i in example['ner_tags']]

(['SOCCER', '-', 'GLORIA', 'BISTRITA', 'BEAT', '2-1', 'F.C.', 'VALLETTA', '.'],
 ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'B-ORG', 'I-ORG', 'O'])

In [9]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
def tokenize_and_update_labels(example):
    tokens =   ["[CLS]"] + example["tokens"] + ["[SEP]"]
    text_labels = [0] + example["ner_tags"] + [0]

    tokenized_sentence = []
    labels = []

    for word, label in zip(tokens, text_labels):

        # We will input one word at a time and if it breaks into subwords we will extend the labels as well. 
        # Play will be tokenised and len(tokenized_word) will be 1 but 
        # playing will be tokenised and give len will be 2 'Play' and '##ing'. 
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        
        # print(n_subwords)

        # adding all the words in sentence in tokenized_sentence 
        tokenized_sentence.extend(tokenized_word)

        # We will add same label for subwords. 
        labels.extend([label] * n_subwords)

    input_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)
    attention_mask = [1] * len(tokenized_sentence)
    
    return {"input_ids": input_ids, "attention_mask": attention_mask,
            "tokenized_tokens": tokenized_sentence, "tokenized_labels": labels}

In [11]:
result = tokenize_and_update_labels(example)
" ".join(result['tokenized_tokens']), ",".join([ner_classes[i] for i in result['tokenized_labels']])

('[CLS] soccer - gloria bis ##tri ##ta beat 2 - 1 f . c . valle ##tta . [SEP]',
 'O,O,O,B-ORG,I-ORG,I-ORG,I-ORG,O,O,O,O,B-ORG,B-ORG,B-ORG,B-ORG,I-ORG,I-ORG,O,O')

In [12]:
result['tokenized_tokens']

['[CLS]',
 'soccer',
 '-',
 'gloria',
 'bis',
 '##tri',
 '##ta',
 'beat',
 '2',
 '-',
 '1',
 'f',
 '.',
 'c',
 '.',
 'valle',
 '##tta',
 '.',
 '[SEP]']

In [13]:
for word, label in zip(result['tokenized_tokens'], result['tokenized_labels']):
    print(word,'-'*10, label, '-'*20, ner_classes[label]) 

[CLS] ---------- 0 -------------------- O
soccer ---------- 0 -------------------- O
- ---------- 0 -------------------- O
gloria ---------- 3 -------------------- B-ORG
bis ---------- 4 -------------------- I-ORG
##tri ---------- 4 -------------------- I-ORG
##ta ---------- 4 -------------------- I-ORG
beat ---------- 0 -------------------- O
2 ---------- 0 -------------------- O
- ---------- 0 -------------------- O
1 ---------- 0 -------------------- O
f ---------- 3 -------------------- B-ORG
. ---------- 3 -------------------- B-ORG
c ---------- 3 -------------------- B-ORG
. ---------- 3 -------------------- B-ORG
valle ---------- 4 -------------------- I-ORG
##tta ---------- 4 -------------------- I-ORG
. ---------- 0 -------------------- O
[SEP] ---------- 0 -------------------- O


In [14]:
tokenized_datasets = conll2003.map(tokenize_and_update_labels)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [15]:
tokenized_datasets['train'][345]

{'id': '345',
 'tokens': ['SOCCER',
  '-',
  'GLORIA',
  'BISTRITA',
  'BEAT',
  '2-1',
  'F.C.',
  'VALLETTA',
  '.'],
 'pos_tags': [21, 8, 22, 22, 21, 11, 21, 22, 7],
 'chunk_tags': [11, 0, 11, 12, 7, 11, 12, 12, 0],
 'ner_tags': [0, 0, 3, 4, 0, 0, 3, 4, 0],
 'input_ids': [101,
  4715,
  1011,
  10778,
  20377,
  18886,
  2696,
  3786,
  1016,
  1011,
  1015,
  1042,
  1012,
  1039,
  1012,
  20171,
  5946,
  1012,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'tokenized_tokens': ['[CLS]',
  'soccer',
  '-',
  'gloria',
  'bis',
  '##tri',
  '##ta',
  'beat',
  '2',
  '-',
  '1',
  'f',
  '.',
  'c',
  '.',
  'valle',
  '##tta',
  '.',
  '[SEP]'],
 'tokenized_labels': [0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 0, 0]}

In [16]:
args = TrainingArguments( 
"test-ner",
eval_strategy = "epoch", 
learning_rate=2e-5, 
per_device_train_batch_size=16, 
per_device_eval_batch_size=16, 
num_train_epochs=3, 
weight_decay=0.01, 
) 


In [17]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [19]:
metric = load("seqeval") 

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [20]:
labels = [ner_classes[i] for i in example["ner_tags"]] 
metric.compute(predictions=[labels], references=[labels]) 

{'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [21]:
example["ner_tags"]

[0, 0, 3, 4, 0, 0, 3, 4, 0]