In [4]:
import json
from pathlib import Path
from typing import List

from hyperopt import hp

from flair.data import TaggedCorpus, Sentence
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import (
    TokenEmbeddings,
    WordEmbeddings,
    StackedEmbeddings,
    FlairEmbeddings,
    ELMoEmbeddings,
    TransformerXLEmbeddings,
)
from flair.hyperparameter import SequenceTaggerParamSelector
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

In [5]:
import emoji

In [6]:
# Column index -> field name mapping handed to the column-corpus loader:
# column 0 holds the token text, column 1 the label stored under the 'pos' key.
columns = {
    0: 'text',
    1: 'pos',
}

In [7]:
# Directory (relative to the notebook) that holds train.txt / dev.txt / test.txt.
data_folder = '../data'

In [None]:
# Load the three splits from `data_folder`, interpreting each file with the
# `columns` layout defined earlier.
split_files = {
    'train_file': 'train.txt',
    'dev_file': 'dev.txt',
    'test_file': 'test.txt',
}
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    **split_files,
)

2019-05-13 22:18:04,632 Reading data from ../data
2019-05-13 22:18:04,635 Train: ../data/train.txt
2019-05-13 22:18:04,636 Dev: ../data/dev.txt
2019-05-13 22:18:04,638 Test: ../data/test.txt


In [6]:
# Work on a 50% sample of the corpus to keep training time manageable.
# NOTE(review): this cell rebinds `corpus` to its own downsampled result, so
# running it a second time without reloading shrinks the data again.
DOWNSAMPLE_FRACTION = 0.5
corpus = corpus.downsample(DOWNSAMPLE_FRACTION)

In [7]:
# Build the label dictionary from the values found in the 'pos' column.
TAG_TYPE = 'pos'
tag_dict = corpus.make_tag_dictionary(tag_type=TAG_TYPE)

In [8]:
# Number of sentences in the (train, dev, test) splits, in that order.
tuple(len(split) for split in (corpus.train, corpus.dev, corpus.test))

(74364, 18592, 23240)

In [9]:
# How many distinct entries the tag dictionary holds.
n_tags = len(tag_dict.idx2item)
n_tags

53

In [10]:
# Token-level embeddings to feed into the tagger. Only a Transformer-XL
# embedding is used here; further TokenEmbeddings can be appended to the list
# and will be combined by StackedEmbeddings in the next cell.
# `TransformerXLEmbeddings` must be imported from `flair.embeddings` together
# with the other embedding classes, otherwise this cell raises NameError on a
# fresh kernel.
embedding_types: List[TokenEmbeddings] = [
    TransformerXLEmbeddings(),
]

In [11]:
# Wrap the list of token embeddings into a single stacked embedding object.
embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [12]:
# Sequence tagger over the stacked embeddings: hidden size 256, CRF enabled,
# predicting the labels stored under the 'pos' tag type.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type='pos',
    tag_dictionary=tag_dict,
    use_crf=True,
)

In [13]:
# Trainer that couples the tagger with the (downsampled) corpus.
trainer: ModelTrainer = ModelTrainer(
    tagger,
    corpus,
)

In [22]:
# Evaluate on the test data using the checkpoint under the model directory,
# scored by micro-averaged F1 with evaluation batches of 32.
# NOTE(review): per the captured log this loads best-model.pt from that
# directory, so a training run must already have written one; the training
# cell appears further down in the notebook.
model_dir = Path("../models/flair-256-transformerxl")
trainer.final_test(
    model_dir,
    embeddings_in_memory=True,
    evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
    eval_mini_batch_size=32,
)

2019-05-12 20:34:11,920 ----------------------------------------------------------------------------------------------------
2019-05-12 20:34:11,922 Testing using best model ...
2019-05-12 20:34:11,927 loading file ../models/flair-256-transformerxl/best-model.pt
2019-05-12 20:35:30,750 MICRO_AVG: acc 0.2699 - f1-score 0.4251
2019-05-12 20:35:30,752 MACRO_AVG: acc 0.2246 - f1-score 0.35498979591836727
2019-05-12 20:35:30,753 :OK_hand:  tp: 23 - fp: 2 - fn: 186 - tn: 23 - precision: 0.9200 - recall: 0.1100 - accuracy: 0.1090 - f1-score: 0.1965
2019-05-12 20:35:30,754 :backhand_index_pointing_down: tp: 51 - fp: 16 - fn: 168 - tn: 51 - precision: 0.7612 - recall: 0.2329 - accuracy: 0.2170 - f1-score: 0.3567
2019-05-12 20:35:30,755 :beaming_face_with_smiling_eyes: tp: 22 - fp: 1 - fn: 224 - tn: 22 - precision: 0.9565 - recall: 0.0894 - accuracy: 0.0891 - f1-score: 0.1635
2019-05-12 20:35:30,756 :black_heart: tp: 152 - fp: 5 - fn: 156 - tn: 152 - precision: 0.9682 - recall: 0.4935 - accuracy

0.4251

In [None]:
# Train the tagger and write logs/checkpoints to the model directory:
# learning rate 0.1, mini-batches of 32 sentences, at most 150 epochs,
# embeddings kept in memory.
trainer.train(
    '../models/flair-256-transformerxl',
    max_epochs=150,
    mini_batch_size=32,
    learning_rate=0.1,
    embeddings_in_memory=True,
)

2019-05-12 12:47:02,314 ----------------------------------------------------------------------------------------------------
2019-05-12 12:47:02,315 Evaluation method: MICRO_F1_SCORE
2019-05-12 12:47:02,321 ----------------------------------------------------------------------------------------------------
2019-05-12 12:47:04,059 epoch 1 - iter 0/2324 - loss 35.98537445
2019-05-12 12:53:22,468 epoch 1 - iter 232/2324 - loss 7.52863758
2019-05-12 12:59:40,561 epoch 1 - iter 464/2324 - loss 6.78123815
2019-05-12 13:05:59,419 epoch 1 - iter 696/2324 - loss 6.39579286
2019-05-12 13:12:19,134 epoch 1 - iter 928/2324 - loss 6.16336954
2019-05-12 13:18:40,650 epoch 1 - iter 1160/2324 - loss 6.01526224
2019-05-12 13:25:02,929 epoch 1 - iter 1392/2324 - loss 5.90229391
2019-05-12 13:31:24,649 epoch 1 - iter 1624/2324 - loss 5.81677344
2019-05-12 13:37:48,285 epoch 1 - iter 1856/2324 - loss 5.72475110
2019-05-12 13:44:11,549 epoch 1 - iter 2088/2324 - loss 5.65542308
2019-05-12 13:50:42,795 epoc

2019-05-12 15:05:20,405 epoch 7 - iter 2320/2324 - loss 4.27244315
2019-05-12 15:05:20,849 ----------------------------------------------------------------------------------------------------
2019-05-12 15:05:20,850 EPOCH 7 done: loss 4.2722 - lr 0.1000 - bad epochs 0
2019-05-12 15:06:25,457 DEV  : loss 4.05659056 - f-score 0.3447 - acc 0.2082
2019-05-12 15:07:41,751 TEST : loss 4.04601669 - f-score 0.3448 - acc 0.2083
2019-05-12 15:07:44,124 ----------------------------------------------------------------------------------------------------
2019-05-12 15:07:44,304 epoch 8 - iter 0/2324 - loss 5.19664955
2019-05-12 15:08:10,935 epoch 8 - iter 232/2324 - loss 4.24038854
2019-05-12 15:08:37,583 epoch 8 - iter 464/2324 - loss 4.21274176
2019-05-12 15:09:05,814 epoch 8 - iter 696/2324 - loss 4.19983644
2019-05-12 15:09:33,790 epoch 8 - iter 928/2324 - loss 4.19572618
2019-05-12 15:10:01,421 epoch 8 - iter 1160/2324 - loss 4.21299740
2019-05-12 15:10:29,119 epoch 8 - iter 1392/2324 - loss 4

In [24]:
# First demo input: a plain technical sentence.
flair_text = 'Alternatively, try using a stacked embedding with FlairEmbeddings and GloVe, over the full data, for 150 epochs.'
sentence = Sentence(flair_text)

In [25]:
# Second demo input: a short, informal sentence.
emoji_text = "I love the conversation with a lot of emoji"
sentence_emoji = Sentence(emoji_text)

In [26]:
# Third demo input: a long sentence with numbers and acronyms.
bert_text = "All of the results in the paper can be replicated in at most 1 hour on a single Cloud TPU, or a few hours on a GPU, starting from the exact same pre-trained model"
sentence_bert = Sentence(bert_text)

In [23]:
# Restore the best checkpoint written during training into `model`.
best_checkpoint = '../models/flair-256-transformerxl/best-model.pt'
model = SequenceTagger.load_from_file(best_checkpoint)

2019-05-12 20:48:17,363 loading file ../models/flair-256-transformerxl/best-model.pt


In [27]:
# Tag the three demo sentences and print them with their predicted labels.
#
# Prediction goes through `model`, the best checkpoint restored from
# best-model.pt, and not through `tagger`. `tagger` is the object constructed
# in this notebook, so its weights depend on what the current kernel session
# did to it (possibly nothing), and using it left the loaded `model` unused.
demo_sentences = [sentence, sentence_bert, sentence_emoji]
model.predict(sentences=demo_sentences)
for tagged in demo_sentences:
    print(tagged.to_tagged_string())

Alternatively, try using a stacked embedding with FlairEmbeddings and GloVe, over the full data, for 150 epochs.
All of the results in the paper can be replicated in at most 1 hour on a single Cloud TPU, or a few hours on a GPU, starting from the exact same pre-trained model
I love the conversation with a lot of emoji


In [39]:
# Shell out to `head` to show the first 26 lines of the tweet file.
!head -n 26 ../../TwitterLM/tweets.txt

💜 Hello 
So yesterday I got my self a new car. God is awesome! 😭😭 
 Lord bustta I greet you 🙌🏿🙌🏿
ADELIN MADE ME YOUR FRIEND 😔
it’s already 2019 but my heart still belongs to namlee ☹️💖 
Rt or have bad luck for the rest of the school year 🤦🏻‍♂️ 
Idris Elba marries Sabrina Dhowre in beautiful Moroccan wedding 😍 
I’m still laughing at Hero’s reaction😁 
190427 magical circus Kobe 🥰❤️🥰❤️🥰❤️🥰❤️🥰❤️😘 
Little date with my handsome ❤️ 
I'm crying for this guy rn 👇🏽😥🤦🏽😂😂🤣🤣 
ali, while i'm beside my brother: how was last night? nalasing ka ba? ☹️☹️☹️
the way i cry over this man 😭💜 
Good night... Sad 😭 
Hello I'm Bea nice to meet you 💗 SSFW EXO 
😎LAST CHANCE TO BOOK: Half Price Flash Sale Exclusive - Mexico ✈ - 
Yes, but the men in that movie are just 😍😍😍😍
Imagine having this much of an impact on the planet, incredible 

In [28]:
# Tag a single tweet and render any predicted :emoji_name: labels as emoji.
#
# Uses `model` (the best checkpoint loaded from best-model.pt) instead of
# `tagger`, whose weights depend on what the current kernel session did to it.
# The apostrophe in "haven’t" was mis-encoded (UTF-8 bytes shown through a
# legacy code page); it is restored so the model sees the intended token.
# `predict` returns a list, hence the [0] to get the tagged sentence back.
tweet = Sentence("Confession: I still haven’t updated my phone and sometimes all I see are squares with ?")
tagged_tweet = model.predict(tweet)[0]
emoji.emojize(tagged_tweet.to_tagged_string())

'Confession: I still haven’t updated my phone and sometimes all I see are squares with ?'

In [29]:
# Show the short demo sentence together with whatever tags it currently carries.
tagged_text = sentence_emoji.to_tagged_string()
tagged_text

'I love the conversation with a lot of emoji'

In [30]:
from flair.visual.training_curves import Plotter

In [33]:
# Plot the training curves recorded in a loss.tsv log.
# NOTE(review): this reads the log of the 'flair-256-elmo' run, while every
# other cell in this notebook works with 'flair-256-transformerxl' — confirm
# whether the ELMo run is really the one meant to be plotted here.
loss_log = '../models/flair-256-elmo/loss.tsv'
plotter = Plotter()
plotter.plot_training_curves(loss_log)
# Weight plotting is currently disabled:
# plotter.plot_weights('../models/flair-testing/weights.txt')