In [1]:
!pip install flair &> /dev/null

In [2]:
from glob import glob
dataset = glob("../input/twitter-loc/gold-random-json/*")

## Data Preprocessing

In [3]:
import json
from itertools import product
from flair.data import Sentence
from flair.tokenization import TokenizerWrapper
from nltk import wordpunct_tokenize

train_corpus = []
dev_corpus = []

for folder,corpus_type in product(dataset,["train","dev"]):
    for line in open(folder + '/' + corpus_type + '.jsonl'):
        obj = json.loads(line)
        sentence = Sentence(obj["text"],use_tokenizer = TokenizerWrapper(wordpunct_tokenize))

        for span_labels in obj["location_mentions"]:
            s,e = span_labels["start_offset"],span_labels["end_offset"]
            s_tok,e_tok = -1,-1
            for i,x in zip(range(len(sentence)),sentence):
                if s <= x.start_pos and x.end_pos <= e :
                    s_tok = s_tok if s_tok != -1 else i
                    e_tok = i
            try:
                sentence[s_tok:e_tok + 1].add_label("ner",span_labels["type"])  
            except:
                pass
        globals()[corpus_type + '_corpus'].append(sentence)
    
    

## Training

In [4]:
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings,FlairEmbeddings,StackedEmbeddings,TransformerWordEmbeddings
from flair.trainers import ModelTrainer
from flair.data import Corpus

corpus = Corpus(train_corpus,dev_corpus) # store train and test data
corpus.downsample(0.35) #downsample to 35% 

<flair.data.Corpus at 0x7f59c6207710>

### GLOVE + BiLSTM + CRF

In [5]:
label_type = 'ner' #Name-Entity Recognition (NER): It can recognise whether a 
                    #word represents a person, location or names in the text.


label_dict = corpus.make_label_dictionary(label_type=label_type) #Create a label dictionary from ner and data
print(label_dict)

embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]
#The three embedding models will be concatenated and should give state of the art results.
#Document Embeddings generate one embedding for an entire text. 
#The produced embeddings are PyTorch vectors. 
#There are two different methods using the word embeddings to obtain a document embedding

embeddings = StackedEmbeddings(embeddings=embedding_types)

tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

trainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/sota-ner-flair',
              learning_rate=0.08,
              mini_batch_size=32,
              max_epochs=16)

2022-11-14 10:01:35,781 Computing label dictionary. Progress:


4534it [00:00, 51169.32it/s]

2022-11-14 10:01:35,878 Dictionary created for label 'ner' with 13 values: Country (seen 1618 times), State (seen 1338 times), City/town (seen 1103 times), Island (seen 324 times), County (seen 90 times), Human-made Point-of-Interest (seen 64 times), District (seen 54 times), Natural Point-of-Interest (seen 42 times), Continent (seen 27 times), Neighborhood (seen 19 times), Road/street (seen 19 times), Other locations (seen 15 times)
Dictionary with 13 tags: <unk>, Country, State, City/town, Island, County, Human-made Point-of-Interest, District, Natural Point-of-Interest, Continent, Neighborhood, Road/street, Other locations





2022-11-14 10:01:36,064 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmp9e7sa6ox


100%|██████████| 160000128/160000128 [00:04<00:00, 37761289.26B/s]

2022-11-14 10:01:40,483 copying /tmp/tmp9e7sa6ox to cache at /root/.flair/embeddings/glove.gensim.vectors.npy
2022-11-14 10:01:40,659 removing temp file /tmp/tmp9e7sa6ox





2022-11-14 10:01:40,860 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmp0xkydsr1


100%|██████████| 21494764/21494764 [00:00<00:00, 36385654.66B/s]

2022-11-14 10:01:41,620 copying /tmp/tmp0xkydsr1 to cache at /root/.flair/embeddings/glove.gensim





2022-11-14 10:01:41,648 removing temp file /tmp/tmp0xkydsr1
2022-11-14 10:01:46,581 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmpf6ckilmj


100%|██████████| 73034624/73034624 [00:01<00:00, 36793745.11B/s]

2022-11-14 10:01:48,731 copying /tmp/tmpf6ckilmj to cache at /root/.flair/embeddings/news-forward-0.4.1.pt





2022-11-14 10:01:48,814 removing temp file /tmp/tmpf6ckilmj
2022-11-14 10:01:56,079 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to /tmp/tmp_ouezfud


100%|██████████| 73034575/73034575 [00:02<00:00, 35220196.65B/s]

2022-11-14 10:01:58,319 copying /tmp/tmp_ouezfud to cache at /root/.flair/embeddings/news-backward-0.4.1.pt
2022-11-14 10:01:58,399 removing temp file /tmp/tmp_ouezfud





2022-11-14 10:01:58,658 SequenceTagger predicts: Dictionary with 49 tags: O, S-Country, B-Country, E-Country, I-Country, S-State, B-State, E-State, I-State, S-City/town, B-City/town, E-City/town, I-City/town, S-Island, B-Island, E-Island, I-Island, S-County, B-County, E-County, I-County, S-Human-made Point-of-Interest, B-Human-made Point-of-Interest, E-Human-made Point-of-Interest, I-Human-made Point-of-Interest, S-District, B-District, E-District, I-District, S-Natural Point-of-Interest, B-Natural Point-of-Interest, E-Natural Point-of-Interest, I-Natural Point-of-Interest, S-Continent, B-Continent, E-Continent, I-Continent, S-Neighborhood, B-Neighborhood, E-Neighborhood, I-Neighborhood, S-Road/street, B-Road/street, E-Road/street, I-Road/street, S-Other locations, B-Other locations, E-Other locations, I-Other locations
2022-11-14 10:01:58,931 ----------------------------------------------------------------------------------------------------
2022-11-14 10:01:58,932 Model: "SequenceTag

100%|██████████| 23/23 [00:07<00:00,  3.20it/s]

2022-11-14 10:02:41,093 Evaluating as a multi-label problem: False
2022-11-14 10:02:41,108 DEV : loss 0.15231506526470184 - f1-score (micro avg)  0.538
2022-11-14 10:02:41,190 BAD EPOCHS (no improvement): 0
2022-11-14 10:02:41,192 saving best model





2022-11-14 10:02:42,775 ----------------------------------------------------------------------------------------------------
2022-11-14 10:02:44,649 epoch 2 - iter 14/142 - loss 0.14358425 - samples/sec: 239.63 - lr: 0.080000
2022-11-14 10:02:46,488 epoch 2 - iter 28/142 - loss 0.14758484 - samples/sec: 243.86 - lr: 0.080000
2022-11-14 10:02:48,270 epoch 2 - iter 42/142 - loss 0.15304072 - samples/sec: 251.92 - lr: 0.080000
2022-11-14 10:02:50,001 epoch 2 - iter 56/142 - loss 0.14468650 - samples/sec: 259.18 - lr: 0.080000
2022-11-14 10:02:52,280 epoch 2 - iter 70/142 - loss 0.13846654 - samples/sec: 197.60 - lr: 0.080000
2022-11-14 10:02:54,033 epoch 2 - iter 84/142 - loss 0.13627222 - samples/sec: 255.88 - lr: 0.080000
2022-11-14 10:02:55,808 epoch 2 - iter 98/142 - loss 0.13372834 - samples/sec: 252.88 - lr: 0.080000
2022-11-14 10:02:58,022 epoch 2 - iter 112/142 - loss 0.13185661 - samples/sec: 202.54 - lr: 0.080000
2022-11-14 10:02:59,867 epoch 2 - iter 126/142 - loss 0.12998564 -

100%|██████████| 23/23 [00:03<00:00,  6.35it/s]

2022-11-14 10:03:05,533 Evaluating as a multi-label problem: False
2022-11-14 10:03:05,547 DEV : loss 0.09187287092208862 - f1-score (micro avg)  0.7531
2022-11-14 10:03:05,629 BAD EPOCHS (no improvement): 0





2022-11-14 10:03:05,631 saving best model
2022-11-14 10:03:07,519 ----------------------------------------------------------------------------------------------------
2022-11-14 10:03:09,373 epoch 3 - iter 14/142 - loss 0.11118566 - samples/sec: 242.65 - lr: 0.080000
2022-11-14 10:03:11,204 epoch 3 - iter 28/142 - loss 0.10677438 - samples/sec: 245.06 - lr: 0.080000
2022-11-14 10:03:13,042 epoch 3 - iter 42/142 - loss 0.10981308 - samples/sec: 244.02 - lr: 0.080000
2022-11-14 10:03:14,950 epoch 3 - iter 56/142 - loss 0.11186415 - samples/sec: 235.23 - lr: 0.080000
2022-11-14 10:03:16,752 epoch 3 - iter 70/142 - loss 0.11325270 - samples/sec: 248.88 - lr: 0.080000
2022-11-14 10:03:18,557 epoch 3 - iter 84/142 - loss 0.10966481 - samples/sec: 248.62 - lr: 0.080000
2022-11-14 10:03:20,308 epoch 3 - iter 98/142 - loss 0.10569062 - samples/sec: 256.19 - lr: 0.080000
2022-11-14 10:03:22,105 epoch 3 - iter 112/142 - loss 0.10651569 - samples/sec: 249.73 - lr: 0.080000
2022-11-14 10:03:23,932 

100%|██████████| 23/23 [00:03<00:00,  5.94it/s]

2022-11-14 10:03:30,046 Evaluating as a multi-label problem: False
2022-11-14 10:03:30,061 DEV : loss 0.0807940810918808 - f1-score (micro avg)  0.779
2022-11-14 10:03:30,143 BAD EPOCHS (no improvement): 0





2022-11-14 10:03:30,145 saving best model
2022-11-14 10:03:32,031 ----------------------------------------------------------------------------------------------------
2022-11-14 10:03:33,871 epoch 4 - iter 14/142 - loss 0.10651883 - samples/sec: 244.08 - lr: 0.080000
2022-11-14 10:03:35,918 epoch 4 - iter 28/142 - loss 0.09368855 - samples/sec: 219.16 - lr: 0.080000
2022-11-14 10:03:37,786 epoch 4 - iter 42/142 - loss 0.09191476 - samples/sec: 240.23 - lr: 0.080000
2022-11-14 10:03:39,650 epoch 4 - iter 56/142 - loss 0.09375563 - samples/sec: 240.65 - lr: 0.080000
2022-11-14 10:03:41,435 epoch 4 - iter 70/142 - loss 0.09100946 - samples/sec: 251.36 - lr: 0.080000
2022-11-14 10:03:43,314 epoch 4 - iter 84/142 - loss 0.09007810 - samples/sec: 238.81 - lr: 0.080000
2022-11-14 10:03:45,066 epoch 4 - iter 98/142 - loss 0.09111849 - samples/sec: 256.01 - lr: 0.080000
2022-11-14 10:03:47,043 epoch 4 - iter 112/142 - loss 0.09135131 - samples/sec: 226.95 - lr: 0.080000
2022-11-14 10:03:48,787 

100%|██████████| 23/23 [00:04<00:00,  5.64it/s]

2022-11-14 10:03:54,795 Evaluating as a multi-label problem: False
2022-11-14 10:03:54,809 DEV : loss 0.07420500367879868 - f1-score (micro avg)  0.79





2022-11-14 10:03:54,892 BAD EPOCHS (no improvement): 0
2022-11-14 10:03:54,894 saving best model
2022-11-14 10:03:56,783 ----------------------------------------------------------------------------------------------------
2022-11-14 10:03:58,810 epoch 5 - iter 14/142 - loss 0.09178285 - samples/sec: 221.51 - lr: 0.080000
2022-11-14 10:04:00,932 epoch 5 - iter 28/142 - loss 0.09209837 - samples/sec: 211.44 - lr: 0.080000
2022-11-14 10:04:02,792 epoch 5 - iter 42/142 - loss 0.08555943 - samples/sec: 241.66 - lr: 0.080000
2022-11-14 10:04:04,597 epoch 5 - iter 56/142 - loss 0.08336726 - samples/sec: 248.47 - lr: 0.080000
2022-11-14 10:04:06,324 epoch 5 - iter 70/142 - loss 0.08487884 - samples/sec: 259.92 - lr: 0.080000
2022-11-14 10:04:08,178 epoch 5 - iter 84/142 - loss 0.08794307 - samples/sec: 241.93 - lr: 0.080000
2022-11-14 10:04:10,146 epoch 5 - iter 98/142 - loss 0.08737433 - samples/sec: 228.02 - lr: 0.080000
2022-11-14 10:04:11,924 epoch 5 - iter 112/142 - loss 0.08603128 - samp

100%|██████████| 23/23 [00:03<00:00,  6.44it/s]

2022-11-14 10:04:19,379 Evaluating as a multi-label problem: False
2022-11-14 10:04:19,410 DEV : loss 0.06877637654542923 - f1-score (micro avg)  0.798





2022-11-14 10:04:19,561 BAD EPOCHS (no improvement): 0
2022-11-14 10:04:19,564 saving best model
2022-11-14 10:04:21,488 ----------------------------------------------------------------------------------------------------
2022-11-14 10:04:23,328 epoch 6 - iter 14/142 - loss 0.08334105 - samples/sec: 244.67 - lr: 0.080000
2022-11-14 10:04:25,101 epoch 6 - iter 28/142 - loss 0.07973664 - samples/sec: 252.94 - lr: 0.080000
2022-11-14 10:04:26,914 epoch 6 - iter 42/142 - loss 0.08182138 - samples/sec: 247.57 - lr: 0.080000
2022-11-14 10:04:28,785 epoch 6 - iter 56/142 - loss 0.08122394 - samples/sec: 239.71 - lr: 0.080000
2022-11-14 10:04:30,758 epoch 6 - iter 70/142 - loss 0.07996460 - samples/sec: 227.37 - lr: 0.080000
2022-11-14 10:04:33,025 epoch 6 - iter 84/142 - loss 0.08135600 - samples/sec: 198.14 - lr: 0.080000
2022-11-14 10:04:34,954 epoch 6 - iter 98/142 - loss 0.08133630 - samples/sec: 232.55 - lr: 0.080000
2022-11-14 10:04:36,801 epoch 6 - iter 112/142 - loss 0.07954041 - samp

100%|██████████| 23/23 [00:03<00:00,  6.28it/s]

2022-11-14 10:04:44,401 Evaluating as a multi-label problem: False
2022-11-14 10:04:44,416 DEV : loss 0.06611115485429764 - f1-score (micro avg)  0.8064





2022-11-14 10:04:44,497 BAD EPOCHS (no improvement): 0
2022-11-14 10:04:44,499 saving best model
2022-11-14 10:04:46,344 ----------------------------------------------------------------------------------------------------
2022-11-14 10:04:48,228 epoch 7 - iter 14/142 - loss 0.07866721 - samples/sec: 238.40 - lr: 0.080000
2022-11-14 10:04:50,054 epoch 7 - iter 28/142 - loss 0.07027915 - samples/sec: 246.15 - lr: 0.080000
2022-11-14 10:04:51,864 epoch 7 - iter 42/142 - loss 0.07297193 - samples/sec: 247.98 - lr: 0.080000
2022-11-14 10:04:53,891 epoch 7 - iter 56/142 - loss 0.07214250 - samples/sec: 221.23 - lr: 0.080000
2022-11-14 10:04:55,720 epoch 7 - iter 70/142 - loss 0.07339500 - samples/sec: 245.34 - lr: 0.080000
2022-11-14 10:04:57,588 epoch 7 - iter 84/142 - loss 0.07375202 - samples/sec: 240.27 - lr: 0.080000
2022-11-14 10:04:59,414 epoch 7 - iter 98/142 - loss 0.07396623 - samples/sec: 245.68 - lr: 0.080000
2022-11-14 10:05:01,222 epoch 7 - iter 112/142 - loss 0.07455242 - samp

100%|██████████| 23/23 [00:04<00:00,  5.56it/s]

2022-11-14 10:05:09,855 Evaluating as a multi-label problem: False
2022-11-14 10:05:09,869 DEV : loss 0.06359512358903885 - f1-score (micro avg)  0.8236
2022-11-14 10:05:09,951 BAD EPOCHS (no improvement): 0





2022-11-14 10:05:09,953 saving best model
2022-11-14 10:05:11,821 ----------------------------------------------------------------------------------------------------
2022-11-14 10:05:13,640 epoch 8 - iter 14/142 - loss 0.07063536 - samples/sec: 246.94 - lr: 0.080000
2022-11-14 10:05:15,633 epoch 8 - iter 28/142 - loss 0.07246922 - samples/sec: 225.06 - lr: 0.080000
2022-11-14 10:05:17,570 epoch 8 - iter 42/142 - loss 0.07603410 - samples/sec: 231.69 - lr: 0.080000
2022-11-14 10:05:19,412 epoch 8 - iter 56/142 - loss 0.07445316 - samples/sec: 243.50 - lr: 0.080000
2022-11-14 10:05:21,151 epoch 8 - iter 70/142 - loss 0.07329504 - samples/sec: 257.99 - lr: 0.080000
2022-11-14 10:05:23,012 epoch 8 - iter 84/142 - loss 0.07300103 - samples/sec: 241.14 - lr: 0.080000
2022-11-14 10:05:24,791 epoch 8 - iter 98/142 - loss 0.07339363 - samples/sec: 252.13 - lr: 0.080000
2022-11-14 10:05:26,776 epoch 8 - iter 112/142 - loss 0.07267391 - samples/sec: 226.11 - lr: 0.080000
2022-11-14 10:05:28,503 

100%|██████████| 23/23 [00:03<00:00,  6.50it/s]

2022-11-14 10:05:34,143 Evaluating as a multi-label problem: False
2022-11-14 10:05:34,159 DEV : loss 0.06726896017789841 - f1-score (micro avg)  0.8141





2022-11-14 10:05:34,242 BAD EPOCHS (no improvement): 1
2022-11-14 10:05:34,244 ----------------------------------------------------------------------------------------------------
2022-11-14 10:05:36,584 epoch 9 - iter 14/142 - loss 0.06983580 - samples/sec: 192.00 - lr: 0.080000
2022-11-14 10:05:38,561 epoch 9 - iter 28/142 - loss 0.06832488 - samples/sec: 227.13 - lr: 0.080000
2022-11-14 10:05:40,372 epoch 9 - iter 42/142 - loss 0.07108264 - samples/sec: 247.80 - lr: 0.080000
2022-11-14 10:05:42,160 epoch 9 - iter 56/142 - loss 0.07056382 - samples/sec: 250.85 - lr: 0.080000
2022-11-14 10:05:43,979 epoch 9 - iter 70/142 - loss 0.06913719 - samples/sec: 246.68 - lr: 0.080000
2022-11-14 10:05:45,762 epoch 9 - iter 84/142 - loss 0.06903578 - samples/sec: 251.71 - lr: 0.080000
2022-11-14 10:05:47,657 epoch 9 - iter 98/142 - loss 0.06926506 - samples/sec: 236.68 - lr: 0.080000
2022-11-14 10:05:49,560 epoch 9 - iter 112/142 - loss 0.06770569 - samples/sec: 235.77 - lr: 0.080000
2022-11-14 

100%|██████████| 23/23 [00:03<00:00,  6.62it/s]

2022-11-14 10:05:56,974 Evaluating as a multi-label problem: False
2022-11-14 10:05:56,988 DEV : loss 0.0658353939652443 - f1-score (micro avg)  0.7945





2022-11-14 10:05:57,073 BAD EPOCHS (no improvement): 2
2022-11-14 10:05:57,074 ----------------------------------------------------------------------------------------------------
2022-11-14 10:05:59,112 epoch 10 - iter 14/142 - loss 0.06192536 - samples/sec: 220.25 - lr: 0.080000
2022-11-14 10:06:00,942 epoch 10 - iter 28/142 - loss 0.06669605 - samples/sec: 245.12 - lr: 0.080000
2022-11-14 10:06:02,775 epoch 10 - iter 42/142 - loss 0.06547793 - samples/sec: 244.79 - lr: 0.080000
2022-11-14 10:06:04,575 epoch 10 - iter 56/142 - loss 0.06539094 - samples/sec: 249.23 - lr: 0.080000
2022-11-14 10:06:06,561 epoch 10 - iter 70/142 - loss 0.06623971 - samples/sec: 225.91 - lr: 0.080000
2022-11-14 10:06:08,500 epoch 10 - iter 84/142 - loss 0.06657855 - samples/sec: 231.67 - lr: 0.080000
2022-11-14 10:06:10,532 epoch 10 - iter 98/142 - loss 0.06696823 - samples/sec: 220.78 - lr: 0.080000
2022-11-14 10:06:12,352 epoch 10 - iter 112/142 - loss 0.06661700 - samples/sec: 246.60 - lr: 0.080000
202

100%|██████████| 23/23 [00:04<00:00,  5.37it/s]

2022-11-14 10:06:20,461 Evaluating as a multi-label problem: False





2022-11-14 10:06:20,485 DEV : loss 0.060830965638160706 - f1-score (micro avg)  0.8309
2022-11-14 10:06:20,628 BAD EPOCHS (no improvement): 0
2022-11-14 10:06:20,630 saving best model
2022-11-14 10:06:22,526 ----------------------------------------------------------------------------------------------------
2022-11-14 10:06:24,433 epoch 11 - iter 14/142 - loss 0.07226787 - samples/sec: 235.83 - lr: 0.080000
2022-11-14 10:06:26,258 epoch 11 - iter 28/142 - loss 0.06534408 - samples/sec: 245.91 - lr: 0.080000
2022-11-14 10:06:28,177 epoch 11 - iter 42/142 - loss 0.06485031 - samples/sec: 233.73 - lr: 0.080000
2022-11-14 10:06:30,055 epoch 11 - iter 56/142 - loss 0.06357679 - samples/sec: 238.91 - lr: 0.080000
2022-11-14 10:06:32,051 epoch 11 - iter 70/142 - loss 0.06482482 - samples/sec: 224.68 - lr: 0.080000
2022-11-14 10:06:33,848 epoch 11 - iter 84/142 - loss 0.06494826 - samples/sec: 249.76 - lr: 0.080000
2022-11-14 10:06:35,650 epoch 11 - iter 98/142 - loss 0.06379204 - samples/sec:

100%|██████████| 23/23 [00:04<00:00,  5.34it/s]

2022-11-14 10:06:46,102 Evaluating as a multi-label problem: False
2022-11-14 10:06:46,117 DEV : loss 0.05924300476908684 - f1-score (micro avg)  0.8092





2022-11-14 10:06:46,199 BAD EPOCHS (no improvement): 1
2022-11-14 10:06:46,200 ----------------------------------------------------------------------------------------------------
2022-11-14 10:06:48,089 epoch 12 - iter 14/142 - loss 0.06206797 - samples/sec: 237.63 - lr: 0.080000
2022-11-14 10:06:49,978 epoch 12 - iter 28/142 - loss 0.06357106 - samples/sec: 237.46 - lr: 0.080000
2022-11-14 10:06:51,816 epoch 12 - iter 42/142 - loss 0.06152739 - samples/sec: 244.12 - lr: 0.080000
2022-11-14 10:06:53,788 epoch 12 - iter 56/142 - loss 0.06095086 - samples/sec: 227.55 - lr: 0.080000
2022-11-14 10:06:55,628 epoch 12 - iter 70/142 - loss 0.06143311 - samples/sec: 244.18 - lr: 0.080000
2022-11-14 10:06:57,430 epoch 12 - iter 84/142 - loss 0.06267059 - samples/sec: 249.05 - lr: 0.080000
2022-11-14 10:06:59,250 epoch 12 - iter 98/142 - loss 0.06024684 - samples/sec: 246.41 - lr: 0.080000
2022-11-14 10:07:00,975 epoch 12 - iter 112/142 - loss 0.06148390 - samples/sec: 260.17 - lr: 0.080000
202

100%|██████████| 23/23 [00:03<00:00,  6.11it/s]

2022-11-14 10:07:09,308 Evaluating as a multi-label problem: False
2022-11-14 10:07:09,332 DEV : loss 0.057672467082738876 - f1-score (micro avg)  0.8126





2022-11-14 10:07:09,488 BAD EPOCHS (no improvement): 2
2022-11-14 10:07:09,492 ----------------------------------------------------------------------------------------------------
2022-11-14 10:07:11,619 epoch 13 - iter 14/142 - loss 0.05901926 - samples/sec: 211.04 - lr: 0.080000
2022-11-14 10:07:13,450 epoch 13 - iter 28/142 - loss 0.06132986 - samples/sec: 244.98 - lr: 0.080000
2022-11-14 10:07:15,226 epoch 13 - iter 42/142 - loss 0.05967694 - samples/sec: 252.68 - lr: 0.080000
2022-11-14 10:07:17,264 epoch 13 - iter 56/142 - loss 0.05966500 - samples/sec: 220.10 - lr: 0.080000
2022-11-14 10:07:19,165 epoch 13 - iter 70/142 - loss 0.05960490 - samples/sec: 236.09 - lr: 0.080000
2022-11-14 10:07:20,929 epoch 13 - iter 84/142 - loss 0.06022265 - samples/sec: 254.28 - lr: 0.080000
2022-11-14 10:07:22,679 epoch 13 - iter 98/142 - loss 0.06055963 - samples/sec: 256.34 - lr: 0.080000
2022-11-14 10:07:24,535 epoch 13 - iter 112/142 - loss 0.05982460 - samples/sec: 241.80 - lr: 0.080000
202

100%|██████████| 23/23 [00:04<00:00,  5.58it/s]

2022-11-14 10:07:32,670 Evaluating as a multi-label problem: False
2022-11-14 10:07:32,685 DEV : loss 0.05732309818267822 - f1-score (micro avg)  0.8347





2022-11-14 10:07:32,768 BAD EPOCHS (no improvement): 0
2022-11-14 10:07:32,770 saving best model
2022-11-14 10:07:34,606 ----------------------------------------------------------------------------------------------------
2022-11-14 10:07:36,415 epoch 14 - iter 14/142 - loss 0.06302488 - samples/sec: 248.84 - lr: 0.080000
2022-11-14 10:07:38,474 epoch 14 - iter 28/142 - loss 0.06295571 - samples/sec: 217.86 - lr: 0.080000
2022-11-14 10:07:40,314 epoch 14 - iter 42/142 - loss 0.05704805 - samples/sec: 244.24 - lr: 0.080000
2022-11-14 10:07:42,416 epoch 14 - iter 56/142 - loss 0.05725535 - samples/sec: 213.38 - lr: 0.080000
2022-11-14 10:07:44,275 epoch 14 - iter 70/142 - loss 0.05705646 - samples/sec: 241.33 - lr: 0.080000
2022-11-14 10:07:46,036 epoch 14 - iter 84/142 - loss 0.05730392 - samples/sec: 254.73 - lr: 0.080000
2022-11-14 10:07:47,876 epoch 14 - iter 98/142 - loss 0.05650177 - samples/sec: 243.83 - lr: 0.080000
2022-11-14 10:07:49,928 epoch 14 - iter 112/142 - loss 0.0556688

100%|██████████| 23/23 [00:03<00:00,  6.65it/s]

2022-11-14 10:07:57,297 Evaluating as a multi-label problem: False
2022-11-14 10:07:57,314 DEV : loss 0.060734156519174576 - f1-score (micro avg)  0.8255





2022-11-14 10:07:57,398 BAD EPOCHS (no improvement): 1
2022-11-14 10:07:57,399 ----------------------------------------------------------------------------------------------------
2022-11-14 10:07:59,279 epoch 15 - iter 14/142 - loss 0.05927442 - samples/sec: 238.72 - lr: 0.080000
2022-11-14 10:08:01,291 epoch 15 - iter 28/142 - loss 0.06028537 - samples/sec: 222.99 - lr: 0.080000
2022-11-14 10:08:03,036 epoch 15 - iter 42/142 - loss 0.05851002 - samples/sec: 257.16 - lr: 0.080000
2022-11-14 10:08:04,876 epoch 15 - iter 56/142 - loss 0.05619488 - samples/sec: 243.77 - lr: 0.080000
2022-11-14 10:08:06,734 epoch 15 - iter 70/142 - loss 0.05584564 - samples/sec: 241.55 - lr: 0.080000
2022-11-14 10:08:08,572 epoch 15 - iter 84/142 - loss 0.05684150 - samples/sec: 244.08 - lr: 0.080000
2022-11-14 10:08:10,485 epoch 15 - iter 98/142 - loss 0.05667322 - samples/sec: 234.50 - lr: 0.080000
2022-11-14 10:08:12,534 epoch 15 - iter 112/142 - loss 0.05606869 - samples/sec: 219.22 - lr: 0.080000
202

100%|██████████| 23/23 [00:03<00:00,  6.61it/s]

2022-11-14 10:08:20,027 Evaluating as a multi-label problem: False
2022-11-14 10:08:20,043 DEV : loss 0.05579303205013275 - f1-score (micro avg)  0.8314





2022-11-14 10:08:20,127 BAD EPOCHS (no improvement): 2
2022-11-14 10:08:20,128 ----------------------------------------------------------------------------------------------------
2022-11-14 10:08:22,131 epoch 16 - iter 14/142 - loss 0.05219561 - samples/sec: 224.11 - lr: 0.080000
2022-11-14 10:08:23,880 epoch 16 - iter 28/142 - loss 0.05566263 - samples/sec: 256.57 - lr: 0.080000
2022-11-14 10:08:25,700 epoch 16 - iter 42/142 - loss 0.05528483 - samples/sec: 246.48 - lr: 0.080000
2022-11-14 10:08:27,479 epoch 16 - iter 56/142 - loss 0.05429878 - samples/sec: 252.20 - lr: 0.080000
2022-11-14 10:08:29,331 epoch 16 - iter 70/142 - loss 0.05385605 - samples/sec: 242.34 - lr: 0.080000
2022-11-14 10:08:31,101 epoch 16 - iter 84/142 - loss 0.05441628 - samples/sec: 253.49 - lr: 0.080000
2022-11-14 10:08:33,011 epoch 16 - iter 98/142 - loss 0.05514218 - samples/sec: 234.77 - lr: 0.080000
2022-11-14 10:08:34,848 epoch 16 - iter 112/142 - loss 0.05543856 - samples/sec: 244.26 - lr: 0.080000
202

100%|██████████| 23/23 [00:04<00:00,  5.48it/s]

2022-11-14 10:08:42,895 Evaluating as a multi-label problem: False
2022-11-14 10:08:42,910 DEV : loss 0.05537836253643036 - f1-score (micro avg)  0.8373





2022-11-14 10:08:42,992 BAD EPOCHS (no improvement): 0
2022-11-14 10:08:42,994 saving best model
2022-11-14 10:08:46,969 ----------------------------------------------------------------------------------------------------
2022-11-14 10:08:46,971 loading file resources/taggers/sota-ner-flair/best-model.pt
2022-11-14 10:08:48,518 SequenceTagger predicts: Dictionary with 51 tags: O, S-Country, B-Country, E-Country, I-Country, S-State, B-State, E-State, I-State, S-City/town, B-City/town, E-City/town, I-City/town, S-Island, B-Island, E-Island, I-Island, S-County, B-County, E-County, I-County, S-Human-made Point-of-Interest, B-Human-made Point-of-Interest, E-Human-made Point-of-Interest, I-Human-made Point-of-Interest, S-District, B-District, E-District, I-District, S-Natural Point-of-Interest, B-Natural Point-of-Interest, E-Natural Point-of-Interest, I-Natural Point-of-Interest, S-Continent, B-Continent, E-Continent, I-Continent, S-Neighborhood, B-Neighborhood, E-Neighborhood, I-Neighborhoo

100%|██████████| 16/16 [00:03<00:00,  4.32it/s]

2022-11-14 10:08:52,665 Evaluating as a multi-label problem: False
2022-11-14 10:08:52,679 0.8385	0.8175	0.8279	0.7455
2022-11-14 10:08:52,680 
Results:
- F-score (micro) 0.8279
- F-score (macro) 0.393
- Accuracy 0.7455

By class:
                              precision    recall  f1-score   support

                     Country     0.9453    0.9268    0.9360       205
                   City/town     0.6871    0.8000    0.7393       140
                       State     0.8824    0.8219    0.8511       146
                      Island     0.8485    0.8485    0.8485        33
                      County     0.7143    0.8333    0.7692         6
Human-made Point-of-Interest     0.0000    0.0000    0.0000        10
   Natural Point-of-Interest     0.0000    0.0000    0.0000         7
                   Continent     0.6667    0.5000    0.5714         4
                    District     0.0000    0.0000    0.0000         3
             Other locations     0.0000    0.0000    0.0000         




{'test_score': 0.8278985507246378,
 'dev_score_history': [0.5379959650302621,
  0.7531436135009927,
  0.779016393442623,
  0.7899686520376177,
  0.7979539641943734,
  0.806366047745358,
  0.8236040609137055,
  0.8140520896426408,
  0.7944636678200693,
  0.8309114927344782,
  0.8092409240924092,
  0.8125819134993447,
  0.8347266881028939,
  0.8254545454545456,
  0.8313773934527487,
  0.8372686662412252],
 'train_loss_history': [0.2619265969410788,
  0.12838247269544426,
  0.10402785370448052,
  0.09264882158382008,
  0.0852409312038254,
  0.08006780225204141,
  0.07475391906159923,
  0.07143794368417215,
  0.06782041281350115,
  0.06547560123382304,
  0.0631039529859215,
  0.06136859215317284,
  0.0587967874320589,
  0.05585249289286439,
  0.05542128599663147,
  0.05436841110678481],
 'dev_loss_history': [0.15231506526470184,
  0.09187287092208862,
  0.0807940810918808,
  0.07420500367879868,
  0.06877637654542923,
  0.06611115485429764,
  0.06359512358903885,
  0.06726896017789841,
  0

In [6]:
# del tagger,trainer,embeddings

### Transformer

In [7]:
# label_type = 'ner'

# label_dict = corpus.make_label_dictionary(label_type=label_type)
# print(label_dict)

# embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
#                                        layers="-1",
#                                        subtoken_pooling="first",
#                                        fine_tune=True,
#                                        use_context=True,
#                                        )

# tagger = SequenceTagger(hidden_size=256,
#                         embeddings=embeddings,
#                         tag_dictionary=label_dict,
#                         tag_type='ner',
#                         use_crf=False,
#                         use_rnn=False,
#                         reproject_embeddings=False,
#                         )

# trainer = ModelTrainer(tagger, corpus)

# trainer.fine_tune('resources/taggers/sota-ner-flair',
#               learning_rate=0.00085,
#               mini_batch_size=16,
#               max_epochs=8)

## INFERENCE

In [8]:
model = SequenceTagger.load('./resources/taggers/sota-ner-flair/final-model.pt') #loading model

2022-11-14 10:08:52,724 loading file ./resources/taggers/sota-ner-flair/final-model.pt
2022-11-14 10:08:54,297 SequenceTagger predicts: Dictionary with 51 tags: O, S-Country, B-Country, E-Country, I-Country, S-State, B-State, E-State, I-State, S-City/town, B-City/town, E-City/town, I-City/town, S-Island, B-Island, E-Island, I-Island, S-County, B-County, E-County, I-County, S-Human-made Point-of-Interest, B-Human-made Point-of-Interest, E-Human-made Point-of-Interest, I-Human-made Point-of-Interest, S-District, B-District, E-District, I-District, S-Natural Point-of-Interest, B-Natural Point-of-Interest, E-Natural Point-of-Interest, I-Natural Point-of-Interest, S-Continent, B-Continent, E-Continent, I-Continent, S-Neighborhood, B-Neighborhood, E-Neighborhood, I-Neighborhood, S-Road/street, B-Road/street, E-Road/street, I-Road/street, S-Other locations, B-Other locations, E-Other locations, I-Other locations, <START>


In [9]:
folder_to_check = '../input/twitter-loc/gold-random-json/kerala_floods_2018/' # END PATH WITH / symbol 

import os

for file in glob(folder_to_check + "*"):
    if "train" not in file:
        continue
    for line in open(file,'r'):
        obj = json.loads(line)
        s = Sentence(obj["text"],use_tokenizer = TokenizerWrapper(wordpunct_tokenize))
        model.predict(s) #model is being used to predict on sentence s
        outp =  {}
        outp["tweet_id"] = obj["tweet_id"]
        outp["location_mentions"] = []
        for e in s.labels:
            outp["location_mentions"].append({"text":e.data_point.text,"start_offset":e.data_point.start_position,"end_offset":e.data_point.end_position})
        with open("prediction.jsonl",'a') as out:
            json.dump(json.dumps(outp),out)
            out.write('\n')

In [10]:
numobs = 5

print("\033[1m Predictions \033[0m")
i = 0

for line in open("prediction.jsonl",'r'):
    print(json.loads(line))
    print()
    i += 1
    if i == numobs:
        break

print('\033[1m Ground Truth \033[0m')
i = 0

for line in open(folder_to_check + "train.jsonl"):
    #print(line)
    i += 1
    if i == numobs:
        break

[1m Predictions [0m
{"tweet_id": "1032071697221005312", "location_mentions": [{"text": "Kerala", "start_offset": 58, "end_offset": 64}, {"text": "Kerala", "start_offset": 204, "end_offset": 210}, {"text": "Karnataka", "start_offset": 223, "end_offset": 232}]}

{"tweet_id": "1034334390195822592", "location_mentions": []}

{"tweet_id": "1030975049510137856", "location_mentions": [{"text": "Kerala", "start_offset": 44, "end_offset": 50}]}

{"tweet_id": "1034481352341692417", "location_mentions": [{"text": "Kerala", "start_offset": 107, "end_offset": 113}, {"text": "India", "start_offset": 259, "end_offset": 264}]}

{"tweet_id": "1031529205673078784", "location_mentions": [{"text": "Kerala", "start_offset": 0, "end_offset": 6}, {"text": "india", "start_offset": 93, "end_offset": 98}, {"text": "kerala", "start_offset": 100, "end_offset": 106}, {"text": "Kerala", "start_offset": 208, "end_offset": 214}]}

[1m Ground Truth [0m
