In [0]:
# Install the flair NLP library into the notebook runtime (IPython `!` shell escape).
# NOTE(review): the version is unpinned, while the cells below rely on the
# `flair.data_fetcher.NLPTaskDataFetcher` API, which appears to emit warnings in the
# recorded output — consider pinning the flair release this notebook was run with.
!pip install flair

In [0]:
from typing import List

from flair.data import Sentence
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [0]:
# Train a universal part-of-speech ('upos') sequence tagger on the UD Indonesian corpus.
# The steps are: load corpus -> build tag dictionary -> stack word embeddings ->
# build the tagger -> wrap it in a trainer -> train.

# 1. get the corpus
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_INDONESIAN)

# 2. what tag do we want to predict?
tag_type = 'upos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
# Two Indonesian word-embedding sets are stacked; the commented-out entries are
# optional alternatives that are currently disabled.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('id-crawl'),
    WordEmbeddings('id'),
    #WordEmbeddings('glove'),
    #BertEmbeddings('bert-base-multilingual-cased')
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
# The first argument is the output directory; the prediction cell later loads
# 'best-model.pt' from this same directory. The call is deliberately left as the
# cell's last expression so the returned history dict is displayed as cell output.
trainer.train('resources/taggers/example-universal-pos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)

2019-10-02 06:37:09,234 Reading data from /root/.flair/datasets/ud_indonesian
2019-10-02 06:37:09,236 Train: /root/.flair/datasets/ud_indonesian/id_gsd-ud-train.conllu
2019-10-02 06:37:09,239 Dev: /root/.flair/datasets/ud_indonesian/id_gsd-ud-dev.conllu
2019-10-02 06:37:09,241 Test: /root/.flair/datasets/ud_indonesian/id_gsd-ud-test.conllu


  
  return NLPTaskDataFetcher.load_ud_corpus(data_folder)
  sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
  sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
  sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)


[b'<unk>', b'O', b'PROPN', b'AUX', b'DET', b'NOUN', b'PRON', b'VERB', b'ADP', b'PUNCT', b'ADV', b'CCONJ', b'SCONJ', b'NUM', b'ADJ', b'PART', b'SYM', b'X', b'<START>', b'<STOP>']


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2019-10-02 06:37:26,876 ----------------------------------------------------------------------------------------------------
2019-10-02 06:37:26,883 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('id-crawl')
    (list_embedding_1): WordEmbeddings('id')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=600, out_features=600, bias=True)
  (rnn): LSTM(600, 256, bidirectional=True)
  (linear): Linear(in_features=512, out_features=20, bias=True)
)"
2019-10-02 06:37:26,886 ----------------------------------------------------------------------------------------------------
2019-10-02 06:37:26,890 Corpus: "Corpus: 4477 train + 559 dev + 557 test sentences"
2019-10-02 06:37:26,893 ----------------------------------------------------------------------------------------------------
2019-10-02 06:37:26,895 Parameters:
2019-10-02 06:37:26,898  - learning_rate: "0.1"
2019-10-02 06

{'dev_loss_history': [tensor(12.8664, device='cuda:0'),
  tensor(8.6487, device='cuda:0'),
  tensor(7.5305, device='cuda:0'),
  tensor(6.5775, device='cuda:0'),
  tensor(6.3480, device='cuda:0'),
  tensor(6.1648, device='cuda:0'),
  tensor(5.9446, device='cuda:0'),
  tensor(5.9260, device='cuda:0'),
  tensor(5.7964, device='cuda:0'),
  tensor(5.7894, device='cuda:0')],
 'dev_score_history': [0.8175,
  0.8693,
  0.892,
  0.9075,
  0.91,
  0.9116,
  0.9156,
  0.9141,
  0.9167,
  0.9142],
 'test_score': 0.9222,
 'train_loss_history': [28.620167078290667,
  13.919212940761021,
  11.322017758233207,
  10.277066789354597,
  9.821163579395838,
  9.272978401184082,
  9.011836685453142,
  8.840235917908805,
  8.617510696819851,
  8.46943097795759]}

In [0]:
# Tag one raw Indonesian sentence with the model saved by the training run.
# `Sentence` is flair.data.Sentence; it must be imported for this cell to work on a
# fresh kernel.
sentence = Sentence('saya dan dia kemarin pegi ke pasar bersama untuk membeli jeru')

# Load the best checkpoint from the training output directory.
tag_pos = SequenceTagger.load('resources/taggers/example-universal-pos/best-model.pt')

# predict() annotates `sentence` in place (its return value is not used here);
# to_tagged_string() then renders each token followed by its predicted <TAG>.
tag_pos.predict(sentence)
print(sentence.to_tagged_string())

2019-10-02 06:50:30,237 loading file resources/taggers/example-universal-pos/best-model.pt
saya <PRON> dan <CCONJ> dia <PRON> kemarin <VERB> pegi <NOUN> ke <ADP> pasar <NOUN> bersama <ADP> untuk <ADP> membeli <VERB> jeru <NOUN>
