In [2]:
!pip install flair

Collecting flair
  Using cached flair-0.9-py3-none-any.whl (319 kB)
Collecting gdown==3.12.2
  Using cached gdown-3.12.2-py3-none-any.whl
Collecting ftfy
  Using cached ftfy-6.0.3-py3-none-any.whl
Collecting hyperopt>=0.1.1
  Using cached hyperopt-0.2.5-py2.py3-none-any.whl (965 kB)
Collecting gensim<=3.8.3,>=3.4.0
  Using cached gensim-3.8.3-cp38-cp38-macosx_11_0_arm64.whl
Collecting sqlitedict>=1.6.0
  Using cached sqlitedict-1.7.0-py3-none-any.whl
Collecting janome
  Using cached Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
Collecting conllu>=4.0
  Using cached conllu-4.4.1-py2.py3-none-any.whl (15 kB)
Collecting langdetect
  Using cached langdetect-1.0.9-py3-none-any.whl
Collecting segtok>=1.5.7
  Using cached segtok-1.5.10-py3-none-any.whl
Collecting mpld3==0.3
  Using cached mpld3-0.3-py3-none-any.whl
Collecting lxml
  Using cached lxml-4.6.3-cp38-cp38-macosx_11_0_arm64.whl
Collecting bpemb>=0.3.2
  Using cached bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting torch!=1.8,>=1.5.0
  

In [6]:
import os

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the column-formatted data files: column 0 is mapped to the
# token text and column 1 to the 'ner' tag layer.
columns = {0: 'text', 1: 'ner'}

# Folder in which the train, dev and test files reside. Set the CORPUS_DIR
# environment variable to point at a different location; when it is unset the
# folder used for the original run is kept as the default.
data_folder = os.environ.get(
    'CORPUS_DIR',
    '/Users/shruthi223/Documents/Projects/CAIR/auto-labeled-corpus/corpus/',
)

# Build the corpus from the column format, the data folder and the names of
# the train, test and dev split files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
)

2021-09-21 20:49:50,443 Reading data from /Users/shruthi223/Documents/Projects/CAIR/auto-labeled-corpus/corpus
2021-09-21 20:49:50,445 Train: /Users/shruthi223/Documents/Projects/CAIR/auto-labeled-corpus/corpus/train.txt
2021-09-21 20:49:50,446 Dev: /Users/shruthi223/Documents/Projects/CAIR/auto-labeled-corpus/corpus/dev.txt
2021-09-21 20:49:50,446 Test: /Users/shruthi223/Documents/Projects/CAIR/auto-labeled-corpus/corpus/test.txt


In [7]:
# Name of the tag layer the tagger is trained to predict.
tag_type = 'ner'
# Build the tag dictionary for that layer from the loaded corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [8]:
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TokenEmbeddings
from typing import List
# Token embeddings to stack. Only GloVe word embeddings are used here; further
# embeddings can be added to this list.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('glove')]

# Wrap the listed embeddings in a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [10]:
from flair.models import SequenceTagger
# Sequence tagger built on the stacked embeddings and the corpus tag
# dictionary, with the CRF option enabled.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
# Print the assembled model architecture.
print(tagger)

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=26, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)


In [12]:
from flair.trainers import ModelTrainer
# Pair the tagger with the corpus for training.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Train with the settings below, using 'resources/taggers/example-ner' as the
# output location. The call is left as the cell's last expression so the value
# it returns is displayed as the cell result.
trainer.train(
    'resources/taggers/example-ner',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=30,
)

2021-09-22 10:45:51,265 ----------------------------------------------------------------------------------------------------
2021-09-22 10:45:51,266 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=26, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2021-09-22 10:45:51,267 ----------------------------------------------------------------------------------------------------
2021-09-22 10:45:51,267 Corpus: "Corpus: 12537 train + 1500 dev + 1483 test sentences"
2021-09-22 10:45:51,268 ----------------------------------------------------------------------------------------------------
2021-09-22 10:45:51,269 Parameters:
2021-09-22 10:45:51,269  - learni



2021-09-22 10:46:05,057 epoch 1 - iter 39/392 - loss 0.12851084 - samples/sec: 90.61 - lr: 0.100000
2021-09-22 10:46:17,626 epoch 1 - iter 78/392 - loss 0.12961779 - samples/sec: 99.31 - lr: 0.100000
2021-09-22 10:46:32,363 epoch 1 - iter 117/392 - loss 0.12443348 - samples/sec: 84.70 - lr: 0.100000
2021-09-22 10:46:45,855 epoch 1 - iter 156/392 - loss 0.12008491 - samples/sec: 92.51 - lr: 0.100000
2021-09-22 10:47:00,727 epoch 1 - iter 195/392 - loss 0.11940493 - samples/sec: 83.92 - lr: 0.100000
2021-09-22 10:47:18,216 epoch 1 - iter 234/392 - loss 0.12093187 - samples/sec: 71.37 - lr: 0.100000
2021-09-22 10:47:36,039 epoch 1 - iter 273/392 - loss 0.12486539 - samples/sec: 70.03 - lr: 0.100000
2021-09-22 10:47:51,994 epoch 1 - iter 312/392 - loss 0.12331967 - samples/sec: 78.23 - lr: 0.100000
2021-09-22 10:48:07,240 epoch 1 - iter 351/392 - loss 0.12612464 - samples/sec: 81.87 - lr: 0.100000
2021-09-22 10:48:21,960 epoch 1 - iter 390/392 - loss 0.12521929 - samples/sec: 84.79 - lr: 0

2021-09-22 11:02:38,121 epoch 6 - iter 351/392 - loss 0.10953034 - samples/sec: 67.64 - lr: 0.100000
2021-09-22 11:02:57,720 epoch 6 - iter 390/392 - loss 0.10914575 - samples/sec: 63.68 - lr: 0.100000
2021-09-22 11:02:58,312 ----------------------------------------------------------------------------------------------------
2021-09-22 11:02:58,312 EPOCH 6 done: loss 0.1091 - lr 0.1000000
2021-09-22 11:03:04,228 DEV : loss 0.0814012885093689 - f1-score (micro avg)  0.9274
2021-09-22 11:03:04,268 BAD EPOCHS (no improvement): 2
2021-09-22 11:03:04,269 ----------------------------------------------------------------------------------------------------
2021-09-22 11:03:22,945 epoch 7 - iter 39/392 - loss 0.11026889 - samples/sec: 66.83 - lr: 0.100000
2021-09-22 11:03:39,867 epoch 7 - iter 78/392 - loss 0.10887351 - samples/sec: 73.76 - lr: 0.100000
2021-09-22 11:03:57,854 epoch 7 - iter 117/392 - loss 0.10809884 - samples/sec: 69.39 - lr: 0.100000
2021-09-22 11:04:16,835 epoch 7 - iter 156

2021-09-22 11:19:30,984 epoch 12 - iter 78/392 - loss 0.09571301 - samples/sec: 68.97 - lr: 0.100000
2021-09-22 11:19:46,652 epoch 12 - iter 117/392 - loss 0.09597488 - samples/sec: 79.67 - lr: 0.100000
2021-09-22 11:20:01,837 epoch 12 - iter 156/392 - loss 0.09661747 - samples/sec: 82.20 - lr: 0.100000
2021-09-22 11:20:19,265 epoch 12 - iter 195/392 - loss 0.09573628 - samples/sec: 71.62 - lr: 0.100000
2021-09-22 11:20:36,866 epoch 12 - iter 234/392 - loss 0.09592448 - samples/sec: 70.92 - lr: 0.100000
2021-09-22 11:20:56,249 epoch 12 - iter 273/392 - loss 0.09700134 - samples/sec: 64.39 - lr: 0.100000
2021-09-22 11:21:15,249 epoch 12 - iter 312/392 - loss 0.09689482 - samples/sec: 65.69 - lr: 0.100000
2021-09-22 11:21:32,779 epoch 12 - iter 351/392 - loss 0.09790921 - samples/sec: 71.21 - lr: 0.100000
2021-09-22 11:21:49,859 epoch 12 - iter 390/392 - loss 0.09836886 - samples/sec: 73.08 - lr: 0.100000
2021-09-22 11:21:50,539 -----------------------------------------------------------

2021-09-22 11:37:24,144 epoch 17 - iter 390/392 - loss 0.08914246 - samples/sec: 69.64 - lr: 0.050000
2021-09-22 11:37:24,898 ----------------------------------------------------------------------------------------------------
2021-09-22 11:37:24,898 EPOCH 17 done: loss 0.0891 - lr 0.0500000
2021-09-22 11:37:31,589 DEV : loss 0.07696074992418289 - f1-score (micro avg)  0.9309
Epoch    17: reducing learning rate of group 0 to 2.5000e-02.
2021-09-22 11:37:31,630 BAD EPOCHS (no improvement): 4
2021-09-22 11:37:31,632 ----------------------------------------------------------------------------------------------------
2021-09-22 11:37:50,236 epoch 18 - iter 39/392 - loss 0.09076670 - samples/sec: 67.09 - lr: 0.025000
2021-09-22 11:38:12,408 epoch 18 - iter 78/392 - loss 0.08780959 - samples/sec: 56.29 - lr: 0.025000
2021-09-22 11:38:34,935 epoch 18 - iter 117/392 - loss 0.08629704 - samples/sec: 55.41 - lr: 0.025000
2021-09-22 11:38:53,697 epoch 18 - iter 156/392 - loss 0.08576338 - samples

2021-09-22 11:55:41,566 epoch 23 - iter 117/392 - loss 0.08481099 - samples/sec: 54.33 - lr: 0.025000
2021-09-22 11:56:00,905 epoch 23 - iter 156/392 - loss 0.08345166 - samples/sec: 64.54 - lr: 0.025000
2021-09-22 11:56:21,900 epoch 23 - iter 195/392 - loss 0.08358237 - samples/sec: 59.45 - lr: 0.025000
2021-09-22 11:56:39,828 epoch 23 - iter 234/392 - loss 0.08339453 - samples/sec: 69.62 - lr: 0.025000
2021-09-22 11:56:59,839 epoch 23 - iter 273/392 - loss 0.08324098 - samples/sec: 62.37 - lr: 0.025000
2021-09-22 11:57:18,560 epoch 23 - iter 312/392 - loss 0.08358450 - samples/sec: 66.67 - lr: 0.025000
2021-09-22 11:57:37,037 epoch 23 - iter 351/392 - loss 0.08320931 - samples/sec: 67.55 - lr: 0.025000
2021-09-22 11:57:58,243 epoch 23 - iter 390/392 - loss 0.08320747 - samples/sec: 58.86 - lr: 0.025000
2021-09-22 11:57:59,336 ----------------------------------------------------------------------------------------------------
2021-09-22 11:57:59,337 EPOCH 23 done: loss 0.0832 - lr 0.0

2021-09-22 12:14:22,667 ----------------------------------------------------------------------------------------------------
2021-09-22 12:14:22,668 EPOCH 28 done: loss 0.0817 - lr 0.0250000
2021-09-22 12:14:31,348 DEV : loss 0.07397507131099701 - f1-score (micro avg)  0.9316
Epoch    28: reducing learning rate of group 0 to 1.2500e-02.
2021-09-22 12:14:31,389 BAD EPOCHS (no improvement): 4
2021-09-22 12:14:31,390 ----------------------------------------------------------------------------------------------------
2021-09-22 12:14:48,519 epoch 29 - iter 39/392 - loss 0.08213504 - samples/sec: 72.87 - lr: 0.012500
2021-09-22 12:15:05,936 epoch 29 - iter 78/392 - loss 0.08257976 - samples/sec: 71.67 - lr: 0.012500
2021-09-22 12:15:25,119 epoch 29 - iter 117/392 - loss 0.08230385 - samples/sec: 65.07 - lr: 0.012500
2021-09-22 12:15:42,659 epoch 29 - iter 156/392 - loss 0.08160150 - samples/sec: 71.16 - lr: 0.012500
2021-09-22 12:16:01,033 epoch 29 - iter 195/392 - loss 0.08110265 - samples

{'test_score': 0.8338293781441779,
 'dev_score_history': [0.9202255109231853,
  0.9193832288953038,
  0.9243073003290987,
  0.9284515091980711,
  0.9257481648785997,
  0.9273537525641933,
  0.9301330289953279,
  0.9307164307164306,
  0.9316266896131368,
  0.9230661533076654,
  0.9296314705673506,
  0.9225361915367483,
  0.9288477234820256,
  0.9252954417557681,
  0.9316157422193831,
  0.9308772053883958,
  0.930865236292245,
  0.9333285973075692,
  0.9305995913478475,
  0.9334515785739624,
  0.9333711369435782,
  0.9310308444413181,
  0.9334895777848798,
  0.9340382500088705,
  0.9303982784774403,
  0.9338731443994602,
  0.9326009922041105,
  0.9315617420647474,
  0.9348881721192358,
  0.933951907072281],
 'train_loss_history': [0.1249649289268057,
  0.12191863914898227,
  0.11761029010843911,
  0.11501744417749608,
  0.1110260927974589,
  0.10914069368600972,
  0.10763223060699005,
  0.10486735150491752,
  0.10317782760427707,
  0.10261656646327061,
  0.10010406861402467,
  0.09845667

In [18]:
from flair.data import Sentence
from flair.models import SequenceTagger
# Load the trained model from the same relative output directory that was
# passed to trainer.train(), rather than from a machine-specific absolute path,
# so this cell picks up the model the training cell produced wherever the
# notebook is run from.
model = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')
# Example sentence to tag.
sentence = Sentence('This module exploits a memory trust issue in Apple QuickTime 7.6.7')
# Run prediction on the sentence, then print it with the predicted tags inline.
model.predict(sentence)
print(sentence.to_tagged_string())

2021-09-22 12:31:33,096 loading file /Users/shruthi223/Documents/Projects/CAIR/resources/taggers/example-ner/final-model.pt
This module exploits a memory <B-relevant_term> trust issue in Apple <B-vendor> QuickTime <B-application> 7.6.7 <B-version>
