# NER with Flair

In [2]:
import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger
import logging
import json
#Import flair modules
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, BytePairEmbeddings, TransformerWordEmbeddings

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


  from .autonotebook import tqdm as notebook_tqdm


In [68]:
# Data preparation
data_path = 'data_bio_shuffled_2.csv'
data_df = pd.read_csv(data_path)
#data_df['labels'] = data_df.labels.replace('I-BRAND','B-BRAND')

In [69]:
# Creating a BILUO tagging scheme 
# Flair accept ner data format with word label in each line of text file with empty line for new sentence
path = 'data/flair/bio/'
def to_biluo(data,fn):
    sentence_df = data.groupby('sentence_id')
    f = open(path+fn,'w', encoding='utf-8')
    for name, sentence_grp in sentence_df:
        for i,item in sentence_grp.iterrows():
            idx = str(i)
            word = str(item['words'])
            tag = str(item['labels'])
            f.write(f"{idx} {word} {tag}\n")
        f.write('\n')
    f.close()


In [70]:
data_df.sentence_id.max()

64934

In [71]:
data_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,AmazonBasics,U-BRAND
1,0,PETG,O
2,0,3D,O
3,0,Printer,O
4,0,Filament,O


Let's split the data into train, dev and test sets that will be used to train and evaluate the model respectively

In [115]:
# Creating train.txt test.txt and dev.txt

idx_train = 59008
idx_dev = 60000
idx_test = 64934

#idx_train = 28008
#idx_dev = 29000
#idx_test = 35934

df_train = data_df[data_df.sentence_id <= idx_train]
df_dev = data_df[(data_df.sentence_id > idx_train) & (data_df.sentence_id <= idx_dev)]
df_test = data_df[(data_df.sentence_id > idx_dev) & (data_df.sentence_id <= idx_test)]

In [116]:
df_train.shape

(1126038, 3)

In [117]:
df_train.head()

Unnamed: 0,sentence_id,words,labels
0,0,AmazonBasics,U-BRAND
1,0,PETG,O
2,0,3D,O
3,0,Printer,O
4,0,Filament,O


In [118]:
df_test.head()

Unnamed: 0,sentence_id,words,labels
1144987,60001,Amazon,B-BRAND
1144988,60001,Brand,I-BRAND
1144989,60001,-,I-BRAND
1144990,60001,Solimo,L-BRAND
1144991,60001,Designer,O


Unnamed: 0,sentence_id,words,labels
553128,29002,Amazon,B-BRAND
553129,29002,Brand,I-BRAND
553130,29002,-,I-BRAND
553131,29002,Solimo,L-BRAND
553132,29002,Designer,O
...,...,...,...
685342,35934,Cover,O
685343,35934,for,O
685344,35934,Coolpad,B-MODEL
685345,35934,Mega,I-MODEL


In [None]:
#df_train.to_csv('data/flair/bio/train.csv', sep='\t', index = True, header = True)
#df_test.to_csv('data/flair/bio/test.csv', sep='\t', index = False, header = False)
#df_dev.to_csv('data/flair/bio/dev.csv', sep='\t', index = False, header = False)

In [119]:
to_biluo(df_test[23:],'train.txt')
to_biluo(df_test,'test.txt')
to_biluo(df_dev,'dev.txt')

## Training


In [120]:
# Creating a corpus object
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns
#columns = {0: 'text', 1: 'ner'}
columns = {0: 'idx', 1: 'text', 2: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = path

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')

# encoding='cp1252'
#corpus.downsample(0.01)

2023-04-12 21:47:55,408 Reading data from data\flair\bio
2023-04-12 21:47:55,410 Train: data\flair\bio\train.txt
2023-04-12 21:47:55,410 Dev: data\flair\bio\dev.txt
2023-04-12 21:47:55,411 Test: data\flair\bio\test.txt


Auxiliary testing code
from flair.data import Corpus
from flair.datasets import ColumnCorpus

 define columns
columns = {0: 'idx', 1: 'text', 2: 'ner'}

 this is the folder in which train, test and dev files reside
data_folder = path

corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')
corpus.downsample(0.01)

Let's check the size of the corpus as a basic sanity check

In [121]:
print(corpus)

Corpus: 4933 train + 992 dev + 4934 test sentences


In [122]:
print(corpus.train[0])

Sentence[19]: "- Solimo Designer Multicolor Canvas 3D Printed Hard Back Case Mobile Cover for Samsung Galaxy J2 ( 2016 )" → ["-"/1145010, "- Solimo"/BRAND, "Solimo"/1145011, "Designer"/1145012, "Multicolor"/1145013, "Canvas"/1145014, "3D"/1145015, "Printed"/1145016, "Hard"/1145017, "Back"/1145018, "Case"/1145019, "Mobile"/1145020, "Cover"/1145021, "for"/1145022, "Samsung"/1145023, "Samsung Galaxy J2 ( 2016 )"/MODEL, "Galaxy"/1145024, "J2"/1145025, "("/1145026, "2016"/1145027, ")"/1145028]


In [123]:
print(corpus.dev[0])

Sentence[20]: "Amazon Brand - Solimo Designer Marble Printed Hard Back Case Mobile Cover for Samsung Galaxy Note 9 ( D218 )" → ["Amazon"/1126038, "Amazon Brand - Solimo"/BRAND, "Brand"/1126039, "-"/1126040, "Solimo"/1126041, "Designer"/1126042, "Marble"/1126043, "Printed"/1126044, "Hard"/1126045, "Back"/1126046, "Case"/1126047, "Mobile"/1126048, "Cover"/1126049, "for"/1126050, "Samsung"/1126051, "Samsung Galaxy Note 9"/MODEL, "Galaxy"/1126052, "Note"/1126053, "9"/1126054, "("/1126055, "D218"/1126056, ")"/1126057]


The following sanity check is to check unexpected behaviour after creating the corpus. In some case some rows of data can trigger all span vectors in a set (train, test, or dev) to be of type Token, which is not iterable and cannot be used to train a Flair model.

In [125]:
#sanity check

sentencesxa = corpus.train
eval_labels = []
count_token_only = 0
count_iters = 0

for sentence in sentencesxa:
    sentence_labels = ["O"] * len(sentence)
    for label in sentence.get_labels('ner'):
        span = label.data_point
        #print(span)
        if not hasattr(span, '__iter__'):
            #print(span)
            count_token_only += 1
        else:
            count_iters += 1
            if len(span) == 1:
                sentence_labels[span[0].idx - 1] = "B-" + label.value
            else:
                sentence_labels[span[0].idx - 1] = "B-" + label.value
                sentence_labels[span[-1].idx - 1] = "I-" + label.value
                for i in range(span[0].idx, span[-1].idx - 1):
                    sentence_labels[i] = "I-" + label.value        
    
    print(len(sentence_labels))
    eval_labels.append(sentence_labels)
    
print("token only: ", count_token_only)
print("iters: ", count_iters)

19
19
20
18
17
19
14
22
17
23
20
17
18
16
19
17
26
20
17
17
18
20
20
17
19
19
20
17
19
17
17
17
18
23
18
23
20
18
21
24
20
18
19
20
22
17
18
21
22
16
17
21
24
14
19
17
17
16
18
20
22
18
18
21
17
24
17
17
23
19
16
21
17
22
17
16
19
18
17
17
18
20
18
20
18
18
17
19
17
18
19
20
13
17
17
20
21
19
18
18
21
17
19
19
23
20
19
22
19
20
7
23
19
18
21
21
18
18
19
18
19
19
17
20
19
19
19
18
19
21
20
23
19
19
20
19
19
22
18
17
18
21
18
17
17
17
19
21
17
23
20
18
21
24
18
17
17
19
20
21
18
16
17
21
16
18
18
18
19
18
20
20
22
18
18
19
17
16
21
19
20
17
19
16
18
17
19
19
20
18
17
23
18
17
25
17
18
17
17
18
19
17
18
18
17
20
20
17
18
24
19
18
21
20
19
20
23
16
16
17
20
16
17
19
23
18
23
19
24
19
17
14
19
22
18
19
18
24
17
17
16
20
16
21
19
22
21
19
17
14
20
18
18
23
24
21
17
20
11
18
18
17
17
17
17
17
20
18
19
18
17
18
17
20
18
18
21
19
8
19
19
21
16
18
19
21
21
20
17
22
19
23
22
19
22
16
19
23
17
18
18
18
18
18
18
18
20
18
17
22
24
20
21
19
18
18
27
21
19
17
22
22
19
23
18
17
22
19
20
17
18
19
18
25


The tag type we use is `ner`

In [126]:
# tag to predict
tag_type = 'ner'
# make tag dictionary from the corpus
#tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

2023-04-12 21:54:01,409 Computing label dictionary. Progress:


4933it [00:00, 47592.05it/s]

2023-04-12 21:54:01,542 Dictionary created for label 'ner' with 4 values: BRAND (seen 4866 times), MODEL (seen 4817 times),  (seen 49 times)





### Checking, with a quick training, for problems in data specification or settings

In [127]:
embeddings= [
    WordEmbeddings('en-glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
]

embeddings = StackedEmbeddings(embeddings)

tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type
                        )

trainer = ModelTrainer(tagger, corpus)


trainer.train('example-ner', mini_batch_size=16,
              max_epochs=1)

2023-04-12 21:54:24,348 SequenceTagger predicts: Dictionary with 13 tags: O, S-BRAND, B-BRAND, E-BRAND, I-BRAND, S-MODEL, B-MODEL, E-MODEL, I-MODEL, S-, B-, E-, I-
2023-04-12 21:54:24,444 ----------------------------------------------------------------------------------------------------
2023-04-12 21:54:24,446 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'en-glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2148,

100%|█████████████████████████████████████████████████████████████████████████████████| 62/62 [00:07<00:00,  8.82it/s]

2023-04-12 21:55:37,546 Evaluating as a multi-label problem: False
2023-04-12 21:55:37,580 DEV : loss 0.055911850184202194 - f1-score (micro avg)  0.9654





2023-04-12 21:55:37,655 BAD EPOCHS (no improvement): 0
2023-04-12 21:55:37,657 saving best model
2023-04-12 21:55:43,757 ----------------------------------------------------------------------------------------------------
2023-04-12 21:55:51,834 SequenceTagger predicts: Dictionary with 15 tags: O, S-BRAND, B-BRAND, E-BRAND, I-BRAND, S-MODEL, B-MODEL, E-MODEL, I-MODEL, S-, B-, E-, I-, <START>, <STOP>


100%|███████████████████████████████████████████████████████████████████████████████| 309/309 [00:23<00:00, 13.11it/s]

2023-04-12 21:56:15,684 Evaluating as a multi-label problem: False





2023-04-12 21:56:15,751 0.9675	0.958	0.9627	0.93
2023-04-12 21:56:15,752 
Results:
- F-score (micro) 0.9627
- F-score (macro) 0.6435
- Accuracy 0.93

By class:
              precision    recall  f1-score   support

       BRAND     0.9667    0.9790    0.9728      4867
       MODEL     0.9692    0.9465    0.9577      4818
                 0.0000    0.0000    0.0000        49

   micro avg     0.9675    0.9580    0.9627      9734
   macro avg     0.6453    0.6418    0.6435      9734
weighted avg     0.9631    0.9580    0.9604      9734

2023-04-12 21:56:15,753 ----------------------------------------------------------------------------------------------------


{'test_score': 0.9627297129878175,
 'dev_score_history': [0.9653757373685561],
 'train_loss_history': [0.22768328426049866],
 'dev_loss_history': [0.055911850184202194]}

Results:
- F-score (micro) 0.9627
- F-score (macro) 0.6435
- Accuracy 0.93

### Training the definitive model

- Glove 6B and Flair embeddings on news articles in one stacked embedding
- BI-LSTM-CRF

In [132]:
embeddings = [
              WordEmbeddings('glove'),
              FlairEmbeddings('news-forward-fast'),
              FlairEmbeddings('news-backward-fast')
]
embeddings = StackedEmbeddings(embeddings)


tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type,
                        use_crf=True
                        )


trainer = ModelTrainer(tagger, corpus)



2023-04-12 22:28:44,772 SequenceTagger predicts: Dictionary with 13 tags: O, S-BRAND, B-BRAND, E-BRAND, I-BRAND, S-MODEL, B-MODEL, E-MODEL, I-MODEL, S-, B-, E-, I-


In [133]:

model_path = 'models/flair/'
model_name = 'flair-ner-amazon'
model_output = (model_path + model_name)

trainer.train(model_output,
              learning_rate=0.03,
              mini_batch_size=32,
              max_epochs=10,
              embeddings_storage_mode='none')


2023-04-12 22:28:55,519 ----------------------------------------------------------------------------------------------------
2023-04-12 22:28:55,521 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2148, out_features=2148, bias=True)
  (rnn): LSTM(2148, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=15, bias=True)
  (loss_f

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.97it/s]

2023-04-12 22:29:16,955 Evaluating as a multi-label problem: False





2023-04-12 22:29:17,007 DEV : loss 0.13717596232891083 - f1-score (micro avg)  0.9212
2023-04-12 22:29:17,027 BAD EPOCHS (no improvement): 0
2023-04-12 22:29:17,030 saving best model
2023-04-12 22:29:17,734 ----------------------------------------------------------------------------------------------------
2023-04-12 22:29:19,432 epoch 2 - iter 15/155 - loss 0.16472294 - time (sec): 1.70 - samples/sec: 5389.40 - lr: 0.030000
2023-04-12 22:29:20,856 epoch 2 - iter 30/155 - loss 0.15028263 - time (sec): 3.12 - samples/sec: 5858.35 - lr: 0.030000
2023-04-12 22:29:22,302 epoch 2 - iter 45/155 - loss 0.15210132 - time (sec): 4.57 - samples/sec: 6022.97 - lr: 0.030000
2023-04-12 22:29:23,742 epoch 2 - iter 60/155 - loss 0.14824034 - time (sec): 6.01 - samples/sec: 6107.18 - lr: 0.030000
2023-04-12 22:29:25,180 epoch 2 - iter 75/155 - loss 0.14589346 - time (sec): 7.45 - samples/sec: 6159.63 - lr: 0.030000
2023-04-12 22:29:26,626 epoch 2 - iter 90/155 - loss 0.14177159 - time (sec): 8.89 - sa

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  7.28it/s]

2023-04-12 22:29:37,068 Evaluating as a multi-label problem: False
2023-04-12 22:29:37,090 DEV : loss 0.08285310119390488 - f1-score (micro avg)  0.92
2023-04-12 22:29:37,114 BAD EPOCHS (no improvement): 1





2023-04-12 22:29:37,116 ----------------------------------------------------------------------------------------------------
2023-04-12 22:29:38,609 epoch 3 - iter 15/155 - loss 0.10862462 - time (sec): 1.49 - samples/sec: 6089.24 - lr: 0.030000
2023-04-12 22:29:40,045 epoch 3 - iter 30/155 - loss 0.10076137 - time (sec): 2.93 - samples/sec: 6198.09 - lr: 0.030000
2023-04-12 22:29:41,469 epoch 3 - iter 45/155 - loss 0.10092346 - time (sec): 4.35 - samples/sec: 6287.35 - lr: 0.030000
2023-04-12 22:29:42,913 epoch 3 - iter 60/155 - loss 0.10172199 - time (sec): 5.80 - samples/sec: 6301.45 - lr: 0.030000
2023-04-12 22:29:44,389 epoch 3 - iter 75/155 - loss 0.09958016 - time (sec): 7.27 - samples/sec: 6284.70 - lr: 0.030000
2023-04-12 22:29:45,838 epoch 3 - iter 90/155 - loss 0.09741501 - time (sec): 8.72 - samples/sec: 6298.45 - lr: 0.030000
2023-04-12 22:29:47,296 epoch 3 - iter 105/155 - loss 0.09671991 - time (sec): 10.18 - samples/sec: 6304.94 - lr: 0.030000
2023-04-12 22:29:48,765 ep

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.95it/s]

2023-04-12 22:29:56,737 Evaluating as a multi-label problem: False
2023-04-12 22:29:56,760 DEV : loss 0.06537532061338425 - f1-score (micro avg)  0.9437
2023-04-12 22:29:56,785 BAD EPOCHS (no improvement): 0





2023-04-12 22:29:56,786 saving best model
2023-04-12 22:29:57,515 ----------------------------------------------------------------------------------------------------
2023-04-12 22:29:58,999 epoch 4 - iter 15/155 - loss 0.05475413 - time (sec): 1.48 - samples/sec: 6252.68 - lr: 0.030000
2023-04-12 22:30:00,429 epoch 4 - iter 30/155 - loss 0.06472555 - time (sec): 2.91 - samples/sec: 6311.74 - lr: 0.030000
2023-04-12 22:30:01,921 epoch 4 - iter 45/155 - loss 0.07091528 - time (sec): 4.41 - samples/sec: 6267.87 - lr: 0.030000
2023-04-12 22:30:03,411 epoch 4 - iter 60/155 - loss 0.07031992 - time (sec): 5.90 - samples/sec: 6231.88 - lr: 0.030000
2023-04-12 22:30:04,896 epoch 4 - iter 75/155 - loss 0.07392385 - time (sec): 7.38 - samples/sec: 6216.57 - lr: 0.030000
2023-04-12 22:30:06,381 epoch 4 - iter 90/155 - loss 0.07291746 - time (sec): 8.87 - samples/sec: 6210.12 - lr: 0.030000
2023-04-12 22:30:07,879 epoch 4 - iter 105/155 - loss 0.07100540 - time (sec): 10.36 - samples/sec: 6199.38

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.74it/s]

2023-04-12 22:30:17,305 Evaluating as a multi-label problem: False





2023-04-12 22:30:17,339 DEV : loss 0.0508183054625988 - f1-score (micro avg)  0.9591
2023-04-12 22:30:17,364 BAD EPOCHS (no improvement): 0
2023-04-12 22:30:17,365 saving best model
2023-04-12 22:30:18,146 ----------------------------------------------------------------------------------------------------
2023-04-12 22:30:19,647 epoch 5 - iter 15/155 - loss 0.05845137 - time (sec): 1.50 - samples/sec: 6124.09 - lr: 0.030000
2023-04-12 22:30:21,145 epoch 5 - iter 30/155 - loss 0.06129741 - time (sec): 3.00 - samples/sec: 6130.30 - lr: 0.030000
2023-04-12 22:30:22,642 epoch 5 - iter 45/155 - loss 0.06273319 - time (sec): 4.49 - samples/sec: 6134.74 - lr: 0.030000
2023-04-12 22:30:24,187 epoch 5 - iter 60/155 - loss 0.06284208 - time (sec): 6.04 - samples/sec: 6090.47 - lr: 0.030000
2023-04-12 22:30:25,663 epoch 5 - iter 75/155 - loss 0.06186600 - time (sec): 7.51 - samples/sec: 6115.73 - lr: 0.030000
2023-04-12 22:30:27,256 epoch 5 - iter 90/155 - loss 0.06014455 - time (sec): 9.11 - sam

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.94it/s]

2023-04-12 22:30:38,187 Evaluating as a multi-label problem: False
2023-04-12 22:30:38,209 DEV : loss 0.043796606361866 - f1-score (micro avg)  0.9678
2023-04-12 22:30:38,230 BAD EPOCHS (no improvement): 0





2023-04-12 22:30:38,233 saving best model
2023-04-12 22:30:38,980 ----------------------------------------------------------------------------------------------------
2023-04-12 22:30:40,557 epoch 6 - iter 15/155 - loss 0.04406633 - time (sec): 1.58 - samples/sec: 5803.77 - lr: 0.030000
2023-04-12 22:30:42,145 epoch 6 - iter 30/155 - loss 0.04981147 - time (sec): 3.16 - samples/sec: 5776.46 - lr: 0.030000
2023-04-12 22:30:43,662 epoch 6 - iter 45/155 - loss 0.05176782 - time (sec): 4.68 - samples/sec: 5865.98 - lr: 0.030000
2023-04-12 22:30:45,344 epoch 6 - iter 60/155 - loss 0.05008426 - time (sec): 6.36 - samples/sec: 5748.95 - lr: 0.030000
2023-04-12 22:30:47,044 epoch 6 - iter 75/155 - loss 0.05128457 - time (sec): 8.06 - samples/sec: 5678.13 - lr: 0.030000
2023-04-12 22:30:48,567 epoch 6 - iter 90/155 - loss 0.05137340 - time (sec): 9.59 - samples/sec: 5735.73 - lr: 0.030000
2023-04-12 22:30:50,196 epoch 6 - iter 105/155 - loss 0.05145277 - time (sec): 11.22 - samples/sec: 5726.40

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.87it/s]

2023-04-12 22:30:59,744 Evaluating as a multi-label problem: False
2023-04-12 22:30:59,769 DEV : loss 0.050754088908433914 - f1-score (micro avg)  0.9708





2023-04-12 22:30:59,793 BAD EPOCHS (no improvement): 0
2023-04-12 22:30:59,795 saving best model
2023-04-12 22:31:00,701 ----------------------------------------------------------------------------------------------------
2023-04-12 22:31:02,265 epoch 7 - iter 15/155 - loss 0.04170180 - time (sec): 1.56 - samples/sec: 5850.10 - lr: 0.030000
2023-04-12 22:31:03,793 epoch 7 - iter 30/155 - loss 0.03914249 - time (sec): 3.09 - samples/sec: 5938.95 - lr: 0.030000
2023-04-12 22:31:05,279 epoch 7 - iter 45/155 - loss 0.04391946 - time (sec): 4.58 - samples/sec: 6013.46 - lr: 0.030000
2023-04-12 22:31:06,715 epoch 7 - iter 60/155 - loss 0.04325347 - time (sec): 6.01 - samples/sec: 6098.60 - lr: 0.030000
2023-04-12 22:31:08,279 epoch 7 - iter 75/155 - loss 0.04288914 - time (sec): 7.58 - samples/sec: 6059.31 - lr: 0.030000
2023-04-12 22:31:09,938 epoch 7 - iter 90/155 - loss 0.04433683 - time (sec): 9.24 - samples/sec: 5947.33 - lr: 0.030000
2023-04-12 22:31:11,547 epoch 7 - iter 105/155 - los

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.63it/s]

2023-04-12 22:31:21,410 Evaluating as a multi-label problem: False
2023-04-12 22:31:21,434 DEV : loss 0.035582661628723145 - f1-score (micro avg)  0.9705





2023-04-12 22:31:21,457 BAD EPOCHS (no improvement): 1
2023-04-12 22:31:21,458 ----------------------------------------------------------------------------------------------------
2023-04-12 22:31:22,979 epoch 8 - iter 15/155 - loss 0.04720817 - time (sec): 1.52 - samples/sec: 6057.45 - lr: 0.030000
2023-04-12 22:31:24,444 epoch 8 - iter 30/155 - loss 0.04291704 - time (sec): 2.99 - samples/sec: 6170.24 - lr: 0.030000
2023-04-12 22:31:25,987 epoch 8 - iter 45/155 - loss 0.04366363 - time (sec): 4.53 - samples/sec: 6089.10 - lr: 0.030000
2023-04-12 22:31:27,511 epoch 8 - iter 60/155 - loss 0.04152722 - time (sec): 6.05 - samples/sec: 6069.20 - lr: 0.030000
2023-04-12 22:31:29,105 epoch 8 - iter 75/155 - loss 0.03953445 - time (sec): 7.65 - samples/sec: 6001.50 - lr: 0.030000
2023-04-12 22:31:30,614 epoch 8 - iter 90/155 - loss 0.03838701 - time (sec): 9.16 - samples/sec: 6008.70 - lr: 0.030000
2023-04-12 22:31:32,122 epoch 8 - iter 105/155 - loss 0.03710203 - time (sec): 10.66 - samples

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.87it/s]

2023-04-12 22:31:41,572 Evaluating as a multi-label problem: False
2023-04-12 22:31:41,594 DEV : loss 0.03311051428318024 - f1-score (micro avg)  0.9701





2023-04-12 22:31:41,617 BAD EPOCHS (no improvement): 2
2023-04-12 22:31:41,619 ----------------------------------------------------------------------------------------------------
2023-04-12 22:31:43,196 epoch 9 - iter 15/155 - loss 0.03842393 - time (sec): 1.58 - samples/sec: 5807.12 - lr: 0.030000
2023-04-12 22:31:44,720 epoch 9 - iter 30/155 - loss 0.03704123 - time (sec): 3.10 - samples/sec: 5927.13 - lr: 0.030000
2023-04-12 22:31:46,257 epoch 9 - iter 45/155 - loss 0.03825967 - time (sec): 4.64 - samples/sec: 5944.55 - lr: 0.030000
2023-04-12 22:31:47,798 epoch 9 - iter 60/155 - loss 0.03479856 - time (sec): 6.18 - samples/sec: 5951.71 - lr: 0.030000
2023-04-12 22:31:49,306 epoch 9 - iter 75/155 - loss 0.03514920 - time (sec): 7.69 - samples/sec: 5972.57 - lr: 0.030000
2023-04-12 22:31:50,882 epoch 9 - iter 90/155 - loss 0.03581389 - time (sec): 9.26 - samples/sec: 5942.52 - lr: 0.030000
2023-04-12 22:31:52,421 epoch 9 - iter 105/155 - loss 0.03605985 - time (sec): 10.80 - samples

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.69it/s]

2023-04-12 22:32:02,040 Evaluating as a multi-label problem: False





2023-04-12 22:32:02,063 DEV : loss 0.033190421760082245 - f1-score (micro avg)  0.9725
2023-04-12 22:32:02,084 BAD EPOCHS (no improvement): 0
2023-04-12 22:32:02,085 saving best model
2023-04-12 22:32:02,788 ----------------------------------------------------------------------------------------------------
2023-04-12 22:32:04,339 epoch 10 - iter 15/155 - loss 0.02929458 - time (sec): 1.55 - samples/sec: 5905.28 - lr: 0.030000
2023-04-12 22:32:05,839 epoch 10 - iter 30/155 - loss 0.03037332 - time (sec): 3.05 - samples/sec: 6002.22 - lr: 0.030000
2023-04-12 22:32:07,412 epoch 10 - iter 45/155 - loss 0.02982489 - time (sec): 4.62 - samples/sec: 5938.63 - lr: 0.030000
2023-04-12 22:32:08,958 epoch 10 - iter 60/155 - loss 0.03008587 - time (sec): 6.17 - samples/sec: 5936.52 - lr: 0.030000
2023-04-12 22:32:10,505 epoch 10 - iter 75/155 - loss 0.03089079 - time (sec): 7.72 - samples/sec: 5939.84 - lr: 0.030000
2023-04-12 22:32:12,068 epoch 10 - iter 90/155 - loss 0.03098164 - time (sec): 9.

100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.66it/s]

2023-04-12 22:32:23,354 Evaluating as a multi-label problem: False





2023-04-12 22:32:23,383 DEV : loss 0.027923069894313812 - f1-score (micro avg)  0.9771
2023-04-12 22:32:23,408 BAD EPOCHS (no improvement): 0
2023-04-12 22:32:23,410 saving best model
2023-04-12 22:32:24,950 ----------------------------------------------------------------------------------------------------
2023-04-12 22:32:25,643 SequenceTagger predicts: Dictionary with 15 tags: O, S-BRAND, B-BRAND, E-BRAND, I-BRAND, S-MODEL, B-MODEL, E-MODEL, I-MODEL, S-, B-, E-, I-, <START>, <STOP>


100%|███████████████████████████████████████████████████████████████████████████████| 155/155 [00:20<00:00,  7.68it/s]

2023-04-12 22:32:46,066 Evaluating as a multi-label problem: False
2023-04-12 22:32:46,126 0.9796	0.98	0.9798	0.9618
2023-04-12 22:32:46,126 
Results:
- F-score (micro) 0.9798
- F-score (macro) 0.8639
- Accuracy 0.9618

By class:
              precision    recall  f1-score   support

       BRAND     0.9835    0.9934    0.9884      4867
       MODEL     0.9778    0.9705    0.9742      4818
                 0.7000    0.5714    0.6292        49

   micro avg     0.9796    0.9800    0.9798      9734
   macro avg     0.8871    0.8451    0.8639      9734
weighted avg     0.9793    0.9800    0.9796      9734

2023-04-12 22:32:46,127 ----------------------------------------------------------------------------------------------------





{'test_score': 0.9797658175842234,
 'dev_score_history': [0.9212310902451747,
  0.9200203769740194,
  0.943711967545639,
  0.9591266818989592,
  0.9678243105209396,
  0.970808576595195,
  0.9704533876719308,
  0.970130201684963,
  0.9724910850738665,
  0.9770525242223355],
 'train_loss_history': [0.5183333705405991,
  0.12473568766611935,
  0.08927567820461128,
  0.07072963822062141,
  0.05991144544678809,
  0.05064307495986278,
  0.04441598527547627,
  0.03982506934487968,
  0.03625268677215024,
  0.03203890563886483],
 'dev_loss_history': [0.13717596232891083,
  0.08285310119390488,
  0.06537532061338425,
  0.0508183054625988,
  0.043796606361866,
  0.050754088908433914,
  0.035582661628723145,
  0.03311051428318024,
  0.033190421760082245,
  0.027923069894313812]}

## Testing


#### Testing first the initial toy model

In [4]:
texts = [
    "Laptop Dell Inspiron X546",
    'Black Pelikan Pencil 16mm',
    'Battery Smart Energy by Energizer',
    'Fijutsu DSLR Camera 156p',
    "Genuine Paul Smith Men's Belt-Leather Woven Plait Belt/BNWT/Sz: 36'/RRP:110.00",
    'Computer HP X80 Intel Xeon',
    'Smart Watch Apple',
    'Computer Big Hewelett-Packard Intel Xeon',
    '24 Buttermilk oz Oroweat Bread,',
    'Black pencil Vertex',
    'Wireless mouse MacTech',
    'Smart Mouse Logitech',
    'Brother Printer V167 Black Ink',
]


trained_model_path = 'example-ner/best-model.pt'
flair_model =  SequenceTagger.load(trained_model_path)
for t in texts : 
    # create example sentence
    sentence = Sentence(t)
    # predict the tags
    flair_model.predict(sentence)
    print(sentence.to_tagged_string())


2023-04-25 01:38:31,690 SequenceTagger predicts: Dictionary with 15 tags: O, S-BRAND, B-BRAND, E-BRAND, I-BRAND, S-MODEL, B-MODEL, E-MODEL, I-MODEL, S-, B-, E-, I-, <START>, <STOP>
Sentence[4]: "Laptop Dell Inspiron X546" → ["Dell Inspiron X546"/MODEL]
Sentence[4]: "Black Pelikan Pencil 16mm" → ["Pelikan Pencil 16mm"/MODEL]
Sentence[5]: "Battery Smart Energy by Energizer" → ["Energizer"/MODEL]
Sentence[4]: "Fijutsu DSLR Camera 156p"
Sentence[19]: "Genuine Paul Smith Men's Belt-Leather Woven Plait Belt/BNWT/Sz: 36'/RRP:110.00" → ["Genuine"/BRAND]
Sentence[5]: "Computer HP X80 Intel Xeon" → ["Intel Xeon"/MODEL]
Sentence[3]: "Smart Watch Apple"
Sentence[5]: "Computer Big Hewelett-Packard Intel Xeon" → ["Intel Xeon"/MODEL]
Sentence[6]: "24 Buttermilk oz Oroweat Bread,"
Sentence[3]: "Black pencil Vertex"
Sentence[3]: "Wireless mouse MacTech"
Sentence[3]: "Smart Mouse Logitech"
Sentence[5]: "Brother Printer V167 Black Ink"


#### Testing the complete model

In [5]:
trained_model_path = 'models/flair/flair-ner-amazon/best-model.pt'
flair_model =  SequenceTagger.load(trained_model_path)

for t in texts : 
    sentence = Sentence(t)
    flair_model.predict(sentence)
    print(sentence.to_tagged_string())


2023-04-25 01:38:37,697 SequenceTagger predicts: Dictionary with 15 tags: O, S-BRAND, B-BRAND, E-BRAND, I-BRAND, S-MODEL, B-MODEL, E-MODEL, I-MODEL, S-, B-, E-, I-, <START>, <STOP>
Sentence[4]: "Laptop Dell Inspiron X546" → ["Dell Inspiron X546"/MODEL]
Sentence[4]: "Black Pelikan Pencil 16mm" → ["Pelikan Pencil 16mm"/MODEL]
Sentence[5]: "Battery Smart Energy by Energizer" → ["Energizer"/MODEL]
Sentence[4]: "Fijutsu DSLR Camera 156p" → ["Fijutsu"/BRAND]
Sentence[19]: "Genuine Paul Smith Men's Belt-Leather Woven Plait Belt/BNWT/Sz: 36'/RRP:110.00" → ["Genuine"/BRAND]
Sentence[5]: "Computer HP X80 Intel Xeon" → ["Intel Xeon"/MODEL]
Sentence[3]: "Smart Watch Apple"
Sentence[5]: "Computer Big Hewelett-Packard Intel Xeon" → ["Hewelett-Packard Intel Xeon"/MODEL]
Sentence[6]: "24 Buttermilk oz Oroweat Bread," → ["24"/MODEL, "Buttermilk", "oz"/MODEL, "Oroweat"/MODEL]
Sentence[3]: "Black pencil Vertex" → ["Black"/BRAND, "pencil Vertex"/MODEL]
Sentence[3]: "Wireless mouse MacTech"
Sentence[3]: "S