In [1]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/77/e3/389c2dd8d0e6ca1d8fad11aa4940e8df6909a26a5d954c0eff01f0d78b57/flair-0.4.3-py3-none-any.whl (180kB)
[K     |█▉                              | 10kB 13.5MB/s eta 0:00:01[K     |███▋                            | 20kB 6.9MB/s eta 0:00:01[K     |█████▌                          | 30kB 9.6MB/s eta 0:00:01[K     |███████▎                        | 40kB 5.8MB/s eta 0:00:01[K     |█████████                       | 51kB 7.0MB/s eta 0:00:01[K     |███████████                     | 61kB 8.2MB/s eta 0:00:01[K     |████████████▊                   | 71kB 9.3MB/s eta 0:00:01[K     |██████████████▋                 | 81kB 10.3MB/s eta 0:00:01[K     |████████████████▍               | 92kB 11.4MB/s eta 0:00:01[K     |██████████████████▏             | 102kB 9.2MB/s eta 0:00:01[K     |████████████████████            | 112kB 9.2MB/s eta 0:00:01[K     |█████████████████████▉          | 122kB 9.2MB/s eta 0:00

In [0]:
from typing import List

import flair.datasets
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [3]:
# Train a universal part-of-speech (UPOS) tagger on the UD Indonesian treebank.

# 1. get the corpus
# flair.datasets is the supported loader; the older NLPTaskDataFetcher entry
# points emit deprecation warnings on every call.
corpus = flair.datasets.UD_INDONESIAN()

# 2. what tag do we want to predict?
tag_type = 'upos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
# Two Indonesian word-embedding sets are stacked, so each token is represented
# by the concatenation of both vectors.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('id-crawl'),
    WordEmbeddings('id'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
# Kept as the last expression so the returned training-history dict is displayed.
trainer.train('resources/taggers/example-universal-pos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)

  """Entry point for launching an IPython kernel.


2019-10-02 14:40:53,330 https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu not found in cache, downloading to /tmp/tmpageezpn7


979367B [00:00, 55400257.94B/s]          

2019-10-02 14:40:53,383 copying /tmp/tmpageezpn7 to cache at /root/.flair/datasets/ud_indonesian/id_gsd-ud-dev.conllu
2019-10-02 14:40:53,387 removing temp file /tmp/tmpageezpn7





2019-10-02 14:40:53,706 https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-test.conllu not found in cache, downloading to /tmp/tmprgx28vln


916721B [00:00, 53745496.39B/s]          

2019-10-02 14:40:53,763 copying /tmp/tmprgx28vln to cache at /root/.flair/datasets/ud_indonesian/id_gsd-ud-test.conllu
2019-10-02 14:40:53,768 removing temp file /tmp/tmprgx28vln





2019-10-02 14:40:54,630 https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-train.conllu not found in cache, downloading to /tmp/tmp3hkk8ykx


7591261B [00:00, 102164433.03B/s]         

2019-10-02 14:40:54,741 copying /tmp/tmp3hkk8ykx to cache at /root/.flair/datasets/ud_indonesian/id_gsd-ud-train.conllu
2019-10-02 14:40:54,756 removing temp file /tmp/tmp3hkk8ykx





2019-10-02 14:40:54,977 Reading data from /root/.flair/datasets/ud_indonesian
2019-10-02 14:40:54,978 Train: /root/.flair/datasets/ud_indonesian/id_gsd-ud-train.conllu
2019-10-02 14:40:54,982 Dev: /root/.flair/datasets/ud_indonesian/id_gsd-ud-dev.conllu
2019-10-02 14:40:54,984 Test: /root/.flair/datasets/ud_indonesian/id_gsd-ud-test.conllu


  return NLPTaskDataFetcher.load_ud_corpus(data_folder)
  sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
  sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
  sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)


[b'<unk>', b'O', b'PROPN', b'AUX', b'DET', b'NOUN', b'PRON', b'VERB', b'ADP', b'PUNCT', b'ADV', b'CCONJ', b'SCONJ', b'NUM', b'ADJ', b'PART', b'SYM', b'X', b'<START>', b'<STOP>']
2019-10-02 14:40:58,802 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/id-crawl-fasttext-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpz8h2d6zj


100%|██████████| 1199998928/1199998928 [00:15<00:00, 75953887.42B/s]

2019-10-02 14:41:14,807 copying /tmp/tmpz8h2d6zj to cache at /root/.flair/embeddings/id-crawl-fasttext-300d-1M.vectors.npy





2019-10-02 14:41:38,862 removing temp file /tmp/tmpz8h2d6zj
2019-10-02 14:41:39,120 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/id-crawl-fasttext-300d-1M not found in cache, downloading to /tmp/tmp49zb9cr7


100%|██████████| 39636845/39636845 [00:03<00:00, 12208766.08B/s]

2019-10-02 14:41:42,546 copying /tmp/tmp49zb9cr7 to cache at /root/.flair/embeddings/id-crawl-fasttext-300d-1M
2019-10-02 14:41:42,607 removing temp file /tmp/tmp49zb9cr7



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2019-10-02 14:41:52,005 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/id-wiki-fasttext-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpm7l0k8dr


100%|██████████| 360822128/360822128 [00:04<00:00, 73167430.57B/s]

2019-10-02 14:41:57,151 copying /tmp/tmpm7l0k8dr to cache at /root/.flair/embeddings/id-wiki-fasttext-300d-1M.vectors.npy





2019-10-02 14:42:04,509 removing temp file /tmp/tmpm7l0k8dr
2019-10-02 14:42:04,657 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/id-wiki-fasttext-300d-1M not found in cache, downloading to /tmp/tmpp681uda2


100%|██████████| 11638719/11638719 [00:03<00:00, 3110435.11B/s]

2019-10-02 14:42:08,579 copying /tmp/tmpp681uda2 to cache at /root/.flair/embeddings/id-wiki-fasttext-300d-1M
2019-10-02 14:42:08,597 removing temp file /tmp/tmpp681uda2





2019-10-02 14:42:12,948 ----------------------------------------------------------------------------------------------------
2019-10-02 14:42:12,950 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('id-crawl')
    (list_embedding_1): WordEmbeddings('id')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=600, out_features=600, bias=True)
  (rnn): LSTM(600, 256, bidirectional=True)
  (linear): Linear(in_features=512, out_features=20, bias=True)
)"
2019-10-02 14:42:12,953 ----------------------------------------------------------------------------------------------------
2019-10-02 14:42:12,955 Corpus: "Corpus: 4477 train + 559 dev + 557 test sentences"
2019-10-02 14:42:12,957 ----------------------------------------------------------------------------------------------------
2019-10-02 14:42:12,959 Parameters:
2019-10-02 14:42:12,962  - learning_rate: "0.1"
2019-10-02 14

{'dev_loss_history': [tensor(12.8496),
  tensor(8.1575),
  tensor(6.8801),
  tensor(6.5519),
  tensor(6.3660),
  tensor(6.0623),
  tensor(5.9195),
  tensor(5.9177),
  tensor(5.7230),
  tensor(5.7690)],
 'dev_score_history': [0.8174,
  0.8888,
  0.9028,
  0.9049,
  0.9081,
  0.9141,
  0.9126,
  0.9134,
  0.9186,
  0.9156],
 'test_score': 0.9248,
 'train_loss_history': [29.67927223614284,
  14.246807411738805,
  11.646686921800885,
  10.323263042313712,
  9.848991067068917,
  9.221789254461015,
  9.06080003806523,
  8.921023992129735,
  8.455742127554757,
  8.433172522272383]}

In [6]:
from flair.data import Sentence

# Checkpoint inside the output directory that was passed to trainer.train(...).
model_file = 'resources/taggers/example-universal-pos/best-model.pt'
tag_pos = SequenceTagger.load(model_file)

# Tag a single Indonesian example sentence and print each token with its predicted tag.
example_text = 'saya dan dia kemarin pergi ke pasar bersama untuk membeli jeruk'
sentence = Sentence(example_text)
tag_pos.predict(sentence)

tagged_output = sentence.to_tagged_string()
print(tagged_output)

2019-10-02 15:21:09,495 loading file resources/taggers/example-universal-pos/best-model.pt
saya <PRON> dan <CCONJ> dia <PRON> kemarin <ADV> pergi <VERB> ke <ADP> pasar <NOUN> bersama <ADP> untuk <ADP> membeli <VERB> jeruk <NOUN>
