In [1]:
from ner.utils import download_untar
import numpy as np
from sklearn.model_selection import train_test_split
from ner.network import NER


from os.path import join
# Dataset directory, given relative to the current working directory.
folder = join('..', 'data')

  from ._conv import register_converters as _register_converters


In [2]:
# Container for the dataset splits; the 'train', 'valid' and 'test' keys are
# assigned by the splitting cells further down.
dataset_dict = dict()


# Parse the two-column "token tag" file into xy_list, a list of
# (tokens, tags) pairs with one pair per sentence.  Any line that does not
# carry a token/tag pair (blank line, single-item line, -DOCSTART- marker)
# closes the sentence currently being collected.
with open( join(folder, 'NER_Pers.txt'), encoding='utf-8') as f:
    xy_list = list()
    tokens = list()
    tags = list()
    val = list()  # not used in this cell; kept in case other cells reference it
    for line in f:
        items = line.split()
        if len(items) > 1 and '-DOCSTART-' not in items[0]:
            # Exactly two whitespace-separated columns are expected here;
            # any other count raises ValueError on unpacking.
            token, tag = items
            # Every token that starts with a digit is replaced by the single
            # placeholder '#'.
            tokens.append('#' if token[0].isdigit() else token)
            tags.append(tag)
        elif tokens:
            xy_list.append((tokens, tags))
            tokens = list()
            tags = list()
    # Flush the final sentence: when the file does not end with a separator
    # line, the loop above finishes with tokens still pending and the last
    # sentence would otherwise be silently dropped.
    if tokens:
        xy_list.append((tokens, tags))
        tokens = list()
        tags = list()


In [3]:
# Keep 80% of the sentences for training and hold out the remaining 20% in
# `ost`.  The fixed random_state makes the shuffle identical on every
# Restart & Run All, so reported metrics are comparable between runs.
dataset_dict['train'], ost = train_test_split(xy_list, test_size=0.2, random_state=42)

In [5]:
# Split the held-out sentences in `ost` evenly into validation and test sets.
# The fixed random_state keeps both sets identical across kernel restarts.
dataset_dict['valid'], dataset_dict['test'] = train_test_split(ost, test_size=0.5, random_state=42)

In [6]:
from ner.corpus import Corpus

# Build the corpus object from the dataset splits and the embeddings file.
# NOTE(review): the embeddings path is relative to the working directory,
# not to `folder` — confirm the file is expected there.
corp = Corpus(
    dataset_dict,
    embeddings_file_path='lenta_lower_100.bin',
)

Loading embeddins...


### Neural Network
Now we have to create the neural network. To do so, we use the NER class from the network module. The NER constructor takes the following arguments:

    token_embeddings_dim - token embeddings dimensionality (must agree with embeddings if they are provided)
    char_embeddings_dim - character embeddings dimensionality 
    use_crf - whether to use Conditional Random Fields on the top (suggested to always use True)
    use_capitalization - whether to include capitalization binary features in the input of the network.
                         If True, then a binary feature indicating whether the word starts with a capital letter
                         will be concatenated to the word embeddings.
    n_filters - list of output feature dimensionalities, one per layer. For [100, 200] there will be two
                layers with 100 and 200 units respectively.
    embeddings_dropout - whether to use dropout on embeddings
    
There is a special argument that determines what type of net to build:
    
    net_type - could be one of the following 'cnn', 'rnn', and 'cnn_highway'
    
For each net type there are a number of optional arguments. For convolutional neural networks ('cnn' and 'cnn_highway' net types) there are:

    filter_width - width of the convolutional filter (number of tokens under the filter)
    use_batch_norm - if True each layer will be provided with batch normalization

For the 'rnn' net type there is:
    
    cell_type - could be lstm or gru

### Network training
To train the network the following parameters must be specified:

    dropout_rate - probability of dropping the hidden state, a value from 0 to 1. 0.5 works well
                   in most cases
    epochs - number of epochs (10 - 100 typical)
    learning_rate - learning rate (0.01 - 0.0001 typical)
    batch_size - number of samples in the batch (4 - 64 typical)
    learning_rate_decay - multiplicative factor by which the learning rate is decreased every epoch (1 - 0.5 typical)

In [7]:
# Construct the tagger on top of the corpus.
# NOTE(review): `use_char_embeddins` is spelled exactly as in the original
# call — confirm against the NER constructor signature before "fixing" it.
net = NER(
    corp,
    token_embeddings_dim=100,
    use_crf=True,
    char_embeddings_dim=25,
    concat_embeddings=True,
    use_char_embeddins=True,
)

# Training hyper-parameters, forwarded to net.fit as keyword arguments.
learning_params = dict(
    dropout_rate=0.5,
    epochs=10,
    learning_rate=0.005,
    batch_size=1,
)

results = net.fit(**learning_params)


Number of parameters: 
ConvNet 185088
Embeddings 3800625
Classifier 1028
transitions:0 16
Total number of parameters equal 3986757


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 0
Eval on valid:
processed 22628 tokens with 1106 phrases; found: 1051 phrases; correct: 994.

precision:  94.58%; recall:  89.87%; FB1:  92.17


Epoch 1
Eval on valid:
processed 22628 tokens with 1106 phrases; found: 1132 phrases; correct: 1048.

precision:  92.58%; recall:  94.76%; FB1:  93.66


Epoch 2
Eval on valid:
processed 22628 tokens with 1106 phrases; found: 1058 phrases; correct: 943.

precision:  89.13%; recall:  85.26%; FB1:  87.15


Epoch 3
Eval on valid:
processed 22628 tokens with 1106 phrases; found: 1161 phrases; correct: 1048.

precision:  90.27%; recall:  94.76%; FB1:  92.46


Epoch 4
Eval on valid:
processed 22628 tokens with 1106 phrases; found: 1037 phrases; correct: 939.

precision:  90.55%; recall:  84.90%; FB1:  87.63


Epoch 5
Eval on valid:
processed 22627 tokens with 1106 phrases; found: 1364 phrases; correct: 974.

precision:  71.41%; recall:  88.07%; FB1:  78.87


Epoch 6
Eval on valid:
processed 22628 tokens with 1106 phrases; found: 937 phrases; c