# Legal Text Annotation and Named-Entity Recognition on Power of Attorney Documents

- Shubhangi Shrivastava
- Utsav Patel

In [1]:
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/44/54/76374f9a448ca765446502e7f2bb53c976e9c055102290fe6f8b0b038b37/flair-0.4.1.tar.gz (78kB)
[K     |████████████████████████████████| 81kB 5.3MB/s 
Collecting segtok>=1.5.7 (from flair)
  Downloading https://files.pythonhosted.org/packages/1d/59/6ed78856ab99d2da04084b59e7da797972baa0efecb71546b16d48e49d9b/segtok-1.5.7.tar.gz
Collecting mpld3>=0.3 (from flair)
[?25l  Downloading https://files.pythonhosted.org/packages/91/95/a52d3a83d0a29ba0d6898f6727e9858fe7a43f6c2ce81a5fe7e05f0f4912/mpld3-0.3.tar.gz (788kB)
[K     |████████████████████████████████| 798kB 15.2MB/s 
Collecting sqlitedict>=1.6.0 (from flair)
  Downloading https://files.pythonhosted.org/packages/0f/1c/c757b93147a219cf1e25cef7e1ad9b595b7f802159493c45ce116521caff/sqlitedict-1.6.0.tar.gz
Collecting deprecated>=1.2.4 (from flair)
  Downloading https://files.pythonhosted.org/packages/9f/7a/003fa432f1e45625626549726c2fbb7a29baa764e9d1fdb2323a5d779f8

In [0]:
import random
from typing import List

import flair
import numpy as np

from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.visual.training_curves import Plotter

In [0]:
# Layout of the column-formatted corpus file: column index -> annotation name.
columns = {
    0: 'text',  # the token itself
    1: 'pos',   # part-of-speech tag
    2: 'ner',   # named-entity tag (the layer selected by tag_type later on)
}

In [0]:
# Directory that holds the corpus file(s); here the notebook's working directory.
data_folder = './'

In [6]:
# Only a training file is supplied, so flair carves the dev and test splits out
# of train.txt by random sampling (the log reports "Dev: None" / "Test: None"
# yet the corpus ends up with dev and test sentences). Seed Python's RNG first
# so the split -- and everything trained on it -- is the same on every run.
random.seed(42)

corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file='train.txt',
    # To use dedicated splits instead of sampled ones, also pass:
    #   test_file='test.txt', dev_file='dev.txt'
)
print(corpus)

2019-05-04 00:22:35,444 Reading data from .
2019-05-04 00:22:35,446 Train: train.txt
2019-05-04 00:22:35,447 Dev: None
2019-05-04 00:22:35,448 Test: None
2019-05-04 00:22:35,452 UTF-8 can't read: train.txt ... using "latin-1" instead.
TaggedCorpus: 222 train + 25 dev + 28 test sentences


In [0]:
# Annotation layer the tagger should learn to predict (one of the names in `columns`).
tag_type = 'ner'

In [8]:
# Collect the label set for the chosen tag type from the corpus and show it.
# NOTE(review): the printed labels include entries such as 'PER|*', '*',
# 'DATE|DATE' and 'WITNESS\\_1', which look like annotation noise in train.txt
# rather than intended classes -- confirm and clean the data if so.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

[b'<unk>', b'O', b'PER', b'COMP', b'DATE', b'ORG', b'WITNESS', b'PER|*', b'*', b'DATE|DATE', b'WITNESS\\_1', b'<START>', b'<STOP>']


In [9]:
# Token embeddings fed to the tagger. Only the pre-trained GloVe word vectors
# are active; the entries below are optional extras.
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('glove'),
    # Uncomment to add character-level embeddings (CharacterEmbeddings is not
    # among the names imported at the top of this notebook, so import it too):
    # CharacterEmbeddings(),
    # Uncomment to add contextual Flair embeddings:
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

2019-05-04 00:22:39,809 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpgemupv6i


100%|██████████| 160000128/160000128 [00:10<00:00, 14825006.58B/s]

2019-05-04 00:22:51,340 copying /tmp/tmpgemupv6i to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2019-05-04 00:22:51,855 removing temp file /tmp/tmpgemupv6i
2019-05-04 00:22:52,480 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmpazf1mhl6


100%|██████████| 21494764/21494764 [00:02<00:00, 8837666.72B/s]

2019-05-04 00:22:55,625 copying /tmp/tmpazf1mhl6 to cache at /root/.flair/embeddings/glove.gensim
2019-05-04 00:22:55,658 removing temp file /tmp/tmpazf1mhl6
2019-05-04 00:22:55,660 this function is deprecated, use smart_open.open instead





In [0]:
# Combine every entry of embedding_types into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [0]:
# Sequence-labelling model built on the stacked embeddings; it predicts the
# labels collected in tag_dictionary for the `tag_type` layer.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_rnn=True,
)

In [0]:
# Bind the model to the corpus it will be trained and evaluated on.
trainer: ModelTrainer = ModelTrainer(
    tagger,
    corpus,
)

In [0]:
# Run training. Everything it produces is written under ./project/example/ner,
# which is where the plotting cell looks for loss.tsv and weights.txt.
trainer.train(
    './project/example/ner',  # output directory
    learning_rate=0.3,
    mini_batch_size=32,
    max_epochs=100,
)

In [0]:
# Visualise the training run from the log files under ./project/example/ner
# (the directory passed to trainer.train): first the curves recorded in
# loss.tsv, then the weight traces recorded in weights.txt.
plotter = Plotter()
plotter.plot_training_curves('./project/example/ner/loss.tsv')
plotter.plot_weights('./project/example/ner/weights.txt')

**References**

https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md