<a href="https://colab.research.google.com/github/vutt-ai-models/transformers_tutorials/blob/main/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from flair.models import SequenceTagger
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Corpus
from flair.datasets import ColumnCorpus

In [None]:
# Directory that holds the train / dev / test split files.
data_folder = '/content/drive/MyDrive/Omer_NER'

# Column index -> annotation name for the column-formatted data files.
columns = {
    0: 'text',
    1: 'ner',
}

# File name of each split, relative to `data_folder`.
split_files = {
    'train_file': 'train_slots.txt',
    'dev_file': 'valid_slots.txt',
    'test_file': 'test_slots.txt',
}

# Build the corpus from the three split files.
corpus = ColumnCorpus(data_folder, columns, **split_files)

2023-06-18 05:21:09,252 Reading data from /content/drive/MyDrive/Omer_NER
2023-06-18 05:21:09,257 Train: /content/drive/MyDrive/Omer_NER/train_slots.txt
2023-06-18 05:21:09,265 Dev: /content/drive/MyDrive/Omer_NER/valid_slots.txt
2023-06-18 05:21:09,267 Test: /content/drive/MyDrive/Omer_NER/test_slots.txt


In [None]:
# Annotation layer the tagger is trained to predict.
tag_type = 'ner'

# Start from the labels that actually occur in the corpus.
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

# Extend the dictionary with every label listed in slots.txt (one per line).
# The file is opened in a `with` block so the handle is always closed, and with
# an explicit encoding so the result does not depend on the locale.
with open(f'{data_folder}/slots.txt', encoding='utf-8') as slots_file:
    slot_labels = [line.rstrip() for line in slots_file]

for label in slot_labels:
    # Skip blank lines (e.g. a trailing newline at the end of the file);
    # otherwise an empty string would be registered as a label.
    if label:
        tag_dictionary.add_item(label)

2023-06-18 05:21:11,103 Computing label dictionary. Progress:


4042it [00:00, 34930.21it/s]

2023-06-18 05:21:11,229 Dictionary created for label 'ner' with 108 values: NUMBER (seen 2097 times), MY (seen 611 times), DEVICE (seen 304 times), TOP_UP (seen 161 times), CHANGE (seen 158 times), BILL (seen 156 times), INTERNET (seen 155 times), PAYMENT (seen 140 times), BUNDLE (seen 137 times), SIM (seen 136 times), DATE (seen 126 times), CARD (seen 115 times), UPGRADING (seen 103 times), CALL (seen 99 times), NO (seen 89 times), AMOUNT (seen 88 times), ACCOUNT (seen 86 times), ACTIVATION (seen 83 times), CANCELLATION (seen 81 times), PURCHASE (seen 80 times)





In [None]:
# Fine-tuneable transformer word embeddings WITH document context.
# The options are collected in one place so they are easy to scan and tweak.
embedding_options = {
    'model': 'bert-base-uncased',
    'layers': "-1",
    'subtoken_pooling': "first",
    'fine_tune': True,
    'use_context': True,
}
embeddings = TransformerWordEmbeddings(**embedding_options)

In [None]:
# Sequence tagger built on the transformer embeddings and the label dictionary
# prepared in the cells above; it predicts `tag_type` labels in BIO format.
tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    tag_format="BIO",
    hidden_size=256,
    use_rnn=True,
    use_crf=True,
    reproject_embeddings=True,
)

2023-06-18 05:21:18,600 SequenceTagger predicts: Dictionary with 251 tags: O, B-NUMBER, I-NUMBER, B-MY, I-MY, B-DEVICE, I-DEVICE, B-TOP_UP, I-TOP_UP, B-CHANGE, I-CHANGE, B-BILL, I-BILL, B-INTERNET, I-INTERNET, B-PAYMENT, I-PAYMENT, B-BUNDLE, I-BUNDLE, B-SIM, I-SIM, B-DATE, I-DATE, B-CARD, I-CARD, B-UPGRADING, I-UPGRADING, B-CALL, I-CALL, B-NO, I-NO, B-AMOUNT, I-AMOUNT, B-ACCOUNT, I-ACCOUNT, B-ACTIVATION, I-ACTIVATION, B-CANCELLATION, I-CANCELLATION, B-PURCHASE, I-PURCHASE, B-MESSAGES, I-MESSAGES, B-MONEY, I-MONEY, B-CONNECTION, I-CONNECTION, B-LOST, I-LOST, B-CONTRACT


In [None]:
from flair.trainers import ModelTrainer
from flair.training_utils import AnnealOnPlateau
from torch.optim.adamw import AdamW
# Trainer that fits `tagger` on `corpus`.
trainer = ModelTrainer(tagger, corpus)

# Hyper-parameters for this run, gathered in one place.
training_options = dict(
    learning_rate=1e-4,
    mini_batch_size=32,
    max_epochs=150,
    scheduler=AnnealOnPlateau,
    optimizer=AdamW,
)

# The first argument is the base path that training output (including the
# saved best model) is written to. The call is kept as the cell's last
# expression so the returned summary of scores and losses is displayed.
trainer.train('/content/drive/MyDrive/Omer_NER', **training_options)

2023-06-18 05:21:20,065 ----------------------------------------------------------------------------------------------------
2023-06-18 05:21:20,072 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30523, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(



2023-06-18 05:21:20,105  - anneal_factor: "0.5"
2023-06-18 05:21:20,106  - max_epochs: "150"
2023-06-18 05:21:20,113  - shuffle: "True"
2023-06-18 05:21:20,114  - train_with_dev: "False"
2023-06-18 05:21:20,115  - batch_growth_annealing: "False"
2023-06-18 05:21:20,116 ----------------------------------------------------------------------------------------------------
2023-06-18 05:21:20,123 Model training base path: "/content/drive/MyDrive/Omer_NER"
2023-06-18 05:21:20,124 ----------------------------------------------------------------------------------------------------
2023-06-18 05:21:20,125 Device: cuda:0
2023-06-18 05:21:20,126 ----------------------------------------------------------------------------------------------------
2023-06-18 05:21:20,132 Embeddings storage mode: cpu
2023-06-18 05:21:20,133 ----------------------------------------------------------------------------------------------------
2023-06-18 05:21:33,140 epoch 1 - iter 12/127 - loss 5.83337326 - time (sec): 

100%|██████████| 24/24 [00:11<00:00,  2.15it/s]

2023-06-18 05:24:14,228 Evaluating as a multi-label problem: False
2023-06-18 05:24:14,256 DEV : loss 2.1678953170776367 - f1-score (micro avg)  0.0038
2023-06-18 05:24:14,293 BAD EPOCHS (no improvement): 0
2023-06-18 05:24:14,299 saving best model





2023-06-18 05:24:16,156 ----------------------------------------------------------------------------------------------------
2023-06-18 05:24:31,983 epoch 2 - iter 12/127 - loss 2.60380987 - time (sec): 15.82 - samples/sec: 257.73 - lr: 0.000100
2023-06-18 05:24:48,579 epoch 2 - iter 24/127 - loss 2.04114150 - time (sec): 32.42 - samples/sec: 255.07 - lr: 0.000100
2023-06-18 05:25:05,053 epoch 2 - iter 36/127 - loss 1.70841891 - time (sec): 48.89 - samples/sec: 253.21 - lr: 0.000100
2023-06-18 05:25:20,678 epoch 2 - iter 48/127 - loss 1.49876947 - time (sec): 64.52 - samples/sec: 256.91 - lr: 0.000100
2023-06-18 05:25:37,506 epoch 2 - iter 60/127 - loss 1.38450689 - time (sec): 81.35 - samples/sec: 254.19 - lr: 0.000100
2023-06-18 05:25:54,168 epoch 2 - iter 72/127 - loss 1.30755905 - time (sec): 98.01 - samples/sec: 252.09 - lr: 0.000100
2023-06-18 05:26:10,635 epoch 2 - iter 84/127 - loss 1.25025430 - time (sec): 114.48 - samples/sec: 251.02 - lr: 0.000100
2023-06-18 05:26:27,558 epo

100%|██████████| 24/24 [00:10<00:00,  2.25it/s]

2023-06-18 05:27:18,650 Evaluating as a multi-label problem: False
2023-06-18 05:27:18,689 DEV : loss 0.5901839733123779 - f1-score (micro avg)  0.3976
2023-06-18 05:27:18,751 BAD EPOCHS (no improvement): 0
2023-06-18 05:27:18,759 saving best model





2023-06-18 05:27:24,744 ----------------------------------------------------------------------------------------------------
2023-06-18 05:27:40,853 epoch 3 - iter 12/127 - loss 0.65482528 - time (sec): 16.10 - samples/sec: 257.31 - lr: 0.000100
2023-06-18 05:27:57,503 epoch 3 - iter 24/127 - loss 0.63700302 - time (sec): 32.76 - samples/sec: 250.83 - lr: 0.000100
2023-06-18 05:28:14,303 epoch 3 - iter 36/127 - loss 0.63832180 - time (sec): 49.56 - samples/sec: 254.67 - lr: 0.000100
2023-06-18 05:28:30,701 epoch 3 - iter 48/127 - loss 0.61518846 - time (sec): 65.95 - samples/sec: 258.65 - lr: 0.000100
2023-06-18 05:28:46,347 epoch 3 - iter 60/127 - loss 0.62129927 - time (sec): 81.60 - samples/sec: 255.71 - lr: 0.000100
2023-06-18 05:29:02,399 epoch 3 - iter 72/127 - loss 0.61821847 - time (sec): 97.65 - samples/sec: 255.94 - lr: 0.000100
2023-06-18 05:29:19,017 epoch 3 - iter 84/127 - loss 0.60865553 - time (sec): 114.27 - samples/sec: 254.55 - lr: 0.000100
2023-06-18 05:29:35,280 epo

100%|██████████| 24/24 [00:11<00:00,  2.14it/s]

2023-06-18 05:30:28,990 Evaluating as a multi-label problem: False
2023-06-18 05:30:29,040 DEV : loss 0.4293428957462311 - f1-score (micro avg)  0.5505
2023-06-18 05:30:29,099 BAD EPOCHS (no improvement): 0
2023-06-18 05:30:29,108 saving best model





2023-06-18 05:30:30,914 ----------------------------------------------------------------------------------------------------
2023-06-18 05:30:47,047 epoch 4 - iter 12/127 - loss 0.53569324 - time (sec): 16.13 - samples/sec: 253.89 - lr: 0.000100
2023-06-18 05:31:02,927 epoch 4 - iter 24/127 - loss 0.49611626 - time (sec): 32.01 - samples/sec: 250.03 - lr: 0.000100
2023-06-18 05:31:19,844 epoch 4 - iter 36/127 - loss 0.45918288 - time (sec): 48.93 - samples/sec: 250.19 - lr: 0.000100
2023-06-18 05:31:36,102 epoch 4 - iter 48/127 - loss 0.46047065 - time (sec): 65.18 - samples/sec: 250.40 - lr: 0.000100
2023-06-18 05:31:52,711 epoch 4 - iter 60/127 - loss 0.45595649 - time (sec): 81.79 - samples/sec: 250.47 - lr: 0.000100
2023-06-18 05:32:08,679 epoch 4 - iter 72/127 - loss 0.46376405 - time (sec): 97.76 - samples/sec: 251.04 - lr: 0.000100
2023-06-18 05:32:24,227 epoch 4 - iter 84/127 - loss 0.46357483 - time (sec): 113.31 - samples/sec: 251.27 - lr: 0.000100
2023-06-18 05:32:41,009 epo

100%|██████████| 24/24 [00:10<00:00,  2.31it/s]

2023-06-18 05:33:32,077 Evaluating as a multi-label problem: False
2023-06-18 05:33:32,112 DEV : loss 0.29185226559638977 - f1-score (micro avg)  0.6911
2023-06-18 05:33:32,150 BAD EPOCHS (no improvement): 0
2023-06-18 05:33:32,156 saving best model





2023-06-18 05:33:36,396 ----------------------------------------------------------------------------------------------------
2023-06-18 05:33:52,739 epoch 5 - iter 12/127 - loss 0.40697303 - time (sec): 16.34 - samples/sec: 248.10 - lr: 0.000100
2023-06-18 05:34:08,749 epoch 5 - iter 24/127 - loss 0.37595789 - time (sec): 32.35 - samples/sec: 254.62 - lr: 0.000100
2023-06-18 05:34:25,605 epoch 5 - iter 36/127 - loss 0.34830335 - time (sec): 49.21 - samples/sec: 255.42 - lr: 0.000100
2023-06-18 05:34:42,148 epoch 5 - iter 48/127 - loss 0.33936503 - time (sec): 65.75 - samples/sec: 258.98 - lr: 0.000100
2023-06-18 05:34:58,779 epoch 5 - iter 60/127 - loss 0.33733682 - time (sec): 82.38 - samples/sec: 255.49 - lr: 0.000100
2023-06-18 05:35:14,069 epoch 5 - iter 72/127 - loss 0.33968453 - time (sec): 97.67 - samples/sec: 256.73 - lr: 0.000100
2023-06-18 05:35:29,775 epoch 5 - iter 84/127 - loss 0.33752530 - time (sec): 113.38 - samples/sec: 255.39 - lr: 0.000100
2023-06-18 05:35:46,081 epo

100%|██████████| 24/24 [00:10<00:00,  2.23it/s]

2023-06-18 05:36:38,077 Evaluating as a multi-label problem: False
2023-06-18 05:36:38,102 DEV : loss 0.2131327986717224 - f1-score (micro avg)  0.7869
2023-06-18 05:36:38,146 BAD EPOCHS (no improvement): 0
2023-06-18 05:36:38,152 saving best model





2023-06-18 05:36:39,825 ----------------------------------------------------------------------------------------------------
2023-06-18 05:36:56,515 epoch 6 - iter 12/127 - loss 0.28412948 - time (sec): 16.69 - samples/sec: 258.46 - lr: 0.000100
2023-06-18 05:37:13,357 epoch 6 - iter 24/127 - loss 0.26201184 - time (sec): 33.53 - samples/sec: 259.11 - lr: 0.000100
2023-06-18 05:37:29,746 epoch 6 - iter 36/127 - loss 0.26009655 - time (sec): 49.92 - samples/sec: 253.87 - lr: 0.000100
2023-06-18 05:37:45,637 epoch 6 - iter 48/127 - loss 0.25583722 - time (sec): 65.81 - samples/sec: 254.22 - lr: 0.000100
2023-06-18 05:38:02,031 epoch 6 - iter 60/127 - loss 0.25092206 - time (sec): 82.20 - samples/sec: 251.20 - lr: 0.000100
2023-06-18 05:38:18,015 epoch 6 - iter 72/127 - loss 0.25537203 - time (sec): 98.19 - samples/sec: 250.70 - lr: 0.000100
2023-06-18 05:38:35,001 epoch 6 - iter 84/127 - loss 0.25464691 - time (sec): 115.17 - samples/sec: 249.76 - lr: 0.000100
2023-06-18 05:38:50,875 epo

100%|██████████| 24/24 [00:11<00:00,  2.16it/s]

2023-06-18 05:39:42,663 Evaluating as a multi-label problem: False
2023-06-18 05:39:42,689 DEV : loss 0.15095172822475433 - f1-score (micro avg)  0.8445
2023-06-18 05:39:42,731 BAD EPOCHS (no improvement): 0
2023-06-18 05:39:42,742 saving best model





2023-06-18 05:39:46,542 ----------------------------------------------------------------------------------------------------
2023-06-18 05:40:03,708 epoch 7 - iter 12/127 - loss 0.19479224 - time (sec): 17.16 - samples/sec: 259.37 - lr: 0.000100
2023-06-18 05:40:19,925 epoch 7 - iter 24/127 - loss 0.19520579 - time (sec): 33.38 - samples/sec: 258.62 - lr: 0.000100
2023-06-18 05:40:28,303 ----------------------------------------------------------------------------------------------------
2023-06-18 05:40:28,307 Exiting from training early.
2023-06-18 05:40:28,309 Saving model ...
2023-06-18 05:40:30,167 Done.
2023-06-18 05:40:30,175 ----------------------------------------------------------------------------------------------------
2023-06-18 05:40:33,273 SequenceTagger predicts: Dictionary with 253 tags: O, B-NUMBER, I-NUMBER, B-MY, I-MY, B-DEVICE, I-DEVICE, B-TOP_UP, I-TOP_UP, B-CHANGE, I-CHANGE, B-BILL, I-BILL, B-INTERNET, I-INTERNET, B-PAYMENT, I-PAYMENT, B-BUNDLE, I-BUNDLE, B-SIM, 

100%|██████████| 23/23 [00:10<00:00,  2.27it/s]

2023-06-18 05:40:43,609 Evaluating as a multi-label problem: False
2023-06-18 05:40:43,645 0.8684	0.8944	0.8812	0.8531
2023-06-18 05:40:43,647 
Results:
- F-score (micro) 0.8812
- F-score (macro) 0.481
- Accuracy 0.8531

By class:
                precision    recall  f1-score   support

        NUMBER     0.9942    1.0000    0.9971       341
            MY     0.9455    1.0000    0.9720       104
        DEVICE     0.9804    1.0000    0.9901        50
        CHANGE     1.0000    1.0000    1.0000        41
          BILL     0.9697    1.0000    0.9846        32
        TOP_UP     0.9655    1.0000    0.9825        28
      INTERNET     0.8966    1.0000    0.9455        26
       PAYMENT     0.9231    1.0000    0.9600        24
     UPGRADING     1.0000    1.0000    1.0000        25
        BUNDLE     0.6296    0.8947    0.7391        19
          CARD     0.8696    1.0000    0.9302        20
           SIM     0.9524    1.0000    0.9756        20
      CONTRACT     0.9500    1.0000    0




{'test_score': 0.8811795316565482,
 'dev_score_history': [0.0037579857196542656,
  0.39756415832970854,
  0.5504587155963302,
  0.6911083017277707,
  0.7868988391376452,
  0.8445360824742267],
 'train_loss_history': [2.1877360389245615,
  1.0588149778054796,
  0.5921017241792514,
  0.439745923691718,
  0.3281521966552171,
  0.23862481277821984],
 'dev_loss_history': [2.1678953170776367,
  0.5901839733123779,
  0.4293428957462311,
  0.29185226559638977,
  0.2131327986717224,
  0.15095172822475433]}

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger
# Reload the best model saved during training from the training base path.
best_model_path = '/content/drive/MyDrive/Omer_NER/best-model.pt'
model = SequenceTagger.load(best_model_path)


2023-06-18 05:40:46,641 SequenceTagger predicts: Dictionary with 253 tags: O, B-NUMBER, I-NUMBER, B-MY, I-MY, B-DEVICE, I-DEVICE, B-TOP_UP, I-TOP_UP, B-CHANGE, I-CHANGE, B-BILL, I-BILL, B-INTERNET, I-INTERNET, B-PAYMENT, I-PAYMENT, B-BUNDLE, I-BUNDLE, B-SIM, I-SIM, B-DATE, I-DATE, B-CARD, I-CARD, B-UPGRADING, I-UPGRADING, B-CALL, I-CALL, B-NO, I-NO, B-AMOUNT, I-AMOUNT, B-ACCOUNT, I-ACCOUNT, B-ACTIVATION, I-ACTIVATION, B-CANCELLATION, I-CANCELLATION, B-PURCHASE, I-PURCHASE, B-MESSAGES, I-MESSAGES, B-MONEY, I-MONEY, B-CONNECTION, I-CONNECTION, B-LOST, I-LOST, B-CONTRACT


In [None]:
# Wrap an example utterance in a Sentence and let the loaded model tag it;
# the predictions are attached to `sentence` itself and inspected in the next cell.
example_text = "customer service"
sentence = Sentence(example_text)
model.predict(sentence)


In [None]:
# Show the example sentence together with the tag predicted for each token.
tagged_output = sentence.to_tagged_string()
print(tagged_output)

Sentence[2]: "customer service" → ["customer"/CONNECTION, "service"/SERVICE]
