## Restore the neural network from save_module

In [2]:
import numpy as np
import tensorflow as tf
from deeppavlov.core.data.simple_vocab import SimpleVocabulary
from deeppavlov.dataset_readers.conll2003_reader import Conll2003DatasetReader
from deeppavlov.metrics.fmeasure import precision_recall_f1
# precision_recall_f1 takes two lists, y_true and y_predicted;
# for each of them the tag sequences of all sentences should be merged into one big list
from deeppavlov.core.data.utils import zero_pad
# zero_pad takes a batch of lists of token indices, pads them with zeros to the
# maximal length, and converts the result to a numpy matrix
from itertools import chain
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
from deeppavlov.models.preprocessors.mask import Mask

def get_embeddings(indices, vocabulary_size, emb_dim):
    """Look up trainable embeddings for a tensor of token indices.

    A [vocabulary_size, emb_dim] matrix is sampled from a standard normal
    distribution and divided by sqrt(emb_dim), which gives every entry a
    variance of 1 / emb_dim. The matrix is stored in a trainable variable
    named 'Embeddings' and then indexed with ``indices``.

    Args:
        indices: Integer tensor of row indices into the embedding matrix.
        vocabulary_size: Number of rows of the embedding matrix.
        emb_dim: Number of columns (embedding dimensionality).

    Returns:
        The result of ``tf.nn.embedding_lookup`` on the embedding variable.
    """
    initial_values = np.random.randn(vocabulary_size, emb_dim).astype(np.float32) / np.sqrt(emb_dim)
    embedding_matrix = tf.Variable(initial_values, name='Embeddings', trainable=True)
    return tf.nn.embedding_lookup(embedding_matrix, indices)


def conv_net(units, n_hidden_list, cnn_filter_width, activation=tf.nn.relu):
    """Apply a stack of 1-D convolutions, each followed by ``activation``.

    Args:
        units: Input tensor fed to the first convolution.
        n_hidden_list: Iterable with the number of filters of each layer;
            one ``tf.layers.conv1d`` layer is created per entry, in order.
        cnn_filter_width: Kernel size shared by all layers.
        activation: Callable applied to the output of every convolution.

    Returns:
        The activated output of the last layer (``units`` itself when
        ``n_hidden_list`` is empty).
    """
    layer_output = units
    for n_filters in n_hidden_list:
        # 'same' padding is requested for every convolution.
        convolved = tf.layers.conv1d(layer_output, n_filters, cnn_filter_width, padding='same')
        layer_output = activation(convolved)
    return layer_output


def masked_cross_entropy(logits, label_indices, number_of_tags, mask):
    """Softmax cross-entropy with masked positions zeroed out.

    Args:
        logits: Unnormalized tag scores.
        label_indices: Integer tag indices; one-hot encoded with depth
            ``number_of_tags`` before the loss is computed.
        number_of_tags: Depth of the one-hot encoding.
        mask: Tensor multiplied element-wise into the per-position loss.

    Returns:
        Scalar tensor: the mean of the masked per-position loss. The mean
        runs over every position of the masked tensor, so positions zeroed
        by the mask still count in the denominator.
    """
    one_hot_targets = tf.one_hot(label_indices, depth=number_of_tags)
    per_position_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=one_hot_targets, logits=logits)
    masked_loss = per_position_loss * mask
    return tf.reduce_mean(masked_loss)


def eval_valid(network, batch_generator):
    """Run ``network`` over all batches and print precision/recall/F1.

    Relies on the module-level ``token_vocab``, ``tag_vocab``, ``get_mask``
    and ``zero_pad`` objects.

    Args:
        network: Callable taking ``(padded_token_indices, mask)`` and
            returning predicted tag indices for every position.
        batch_generator: Iterable of ``(tokens_batch, tags_batch)`` pairs.

    Returns:
        None. The metrics are printed by ``precision_recall_f1``.
    """
    gold_tags = []
    predicted_tags = []
    for tokens_batch, tags_batch in batch_generator:
        # Tokens -> vocabulary indices -> zero-padded matrix.
        padded_indices = zero_pad(token_vocab(tokens_batch))

        # The mask is computed from the raw tokens, not from the indices.
        batch_mask = get_mask(tokens_batch)

        # NerNetwork instances are callable (they define __call__).
        raw_predictions = network(padded_indices, batch_mask)

        # Cut each predicted row down to the length of its sentence so
        # that predictions for padding positions are discarded.
        trimmed_predictions = [row[:len(tokens_batch[n])]
                               for n, row in enumerate(raw_predictions)]
        decoded_tags = tag_vocab(trimmed_predictions)

        # Flatten sentence-level sequences into the running flat lists.
        gold_tags.extend(chain.from_iterable(tags_batch))
        predicted_tags.extend(chain.from_iterable(decoded_tags))
    precision_recall_f1(gold_tags, predicted_tags, print_results=True)


class NerNetwork:
    """Convolutional tagger built as a TensorFlow graph with its own session.

    The constructor builds the graph (token embeddings -> dropout -> stack of
    1-D convolutions -> dropout -> dense projection to tag logits), defines a
    masked cross-entropy loss with an Adam training op, then opens a
    ``tf.Session``, runs the global variable initializer and creates a
    ``tf.train.Saver`` (exposed as ``self.sess`` and ``self.saver``).
    """

    def __init__(self,
                 n_tokens,
                 n_tags,
                 token_emb_dim=100,
                 n_hidden_list=(128,),
                 cnn_filter_width=7,
                 use_batch_norm=False,
                 embeddings_dropout=False,
                 top_dropout=False,
                 **kwargs):
        """Build the graph, the training op and the session.

        Args:
            n_tokens: Vocabulary size; number of rows of the embedding matrix.
            n_tags: Number of tags; output size of the dense layer and depth
                of the one-hot labels in the loss.
            token_emb_dim: Embedding dimensionality.
            n_hidden_list: Number of filters for each convolutional layer.
            cnn_filter_width: Kernel size of every convolutional layer.
            use_batch_norm: Accepted but not used in this constructor.
            embeddings_dropout: Accepted but not used in this constructor;
                dropout on the embeddings is always in the graph and is
                controlled through ``dropout_keep_ph``.
            top_dropout: Accepted but not used in this constructor; dropout
                on the convolution output is always in the graph.
            **kwargs: Accepted and ignored.
        """
        # ================ Building inputs =================

        # Scalar placeholders (shape []) for the optimizer step size and the
        # dropout keep probability.
        self.learning_rate_ph = tf.placeholder(tf.float32, [])
        self.dropout_keep_ph = tf.placeholder(tf.float32, [])
        # 2-D placeholders with both dimensions left dynamic.
        self.token_ph = tf.placeholder(tf.int32, [None, None], name='token_ind_ph')
        self.mask_ph = tf.placeholder(tf.float32, [None, None], name='Mask_ph')
        self.y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')

        # ================== Building the network ==================

        # Embed the token indices with get_embeddings; token_emb_dim is the
        # embedding size.

        emb = get_embeddings(self.token_ph, n_tokens, token_emb_dim)

        # noise_shape has size 1 on axis 1, so (per tf.nn.dropout's noise_shape
        # broadcasting) one keep/drop decision is shared along that axis.
        emb = tf.nn.dropout(emb, self.dropout_keep_ph, (tf.shape(emb)[0], 1, tf.shape(emb)[2]))

        # Build a multilayer CNN on top of the embeddings.
        # The number of filters in each layer matches the corresponding
        # entry of n_hidden_list; conv_net's default activation (ReLU) is used.
        units = conv_net(emb, n_hidden_list, cnn_filter_width)
        # Same broadcast-along-axis-1 dropout as applied to the embeddings.
        units = tf.nn.dropout(units, self.dropout_keep_ph, (tf.shape(units)[0], 1, tf.shape(units)[2]))
        # Linear projection to per-position tag scores.
        logits = tf.layers.dense(units, n_tags, activation=None)
        # Predicted tag index = argmax over the last (tag) axis.
        self.predictions = tf.argmax(logits, 2)

        # ================= Loss and train ops =================
        # Masked softmax cross-entropy between the logits and the labels in y_ph
        # (see masked_cross_entropy / tf.nn.softmax_cross_entropy_with_logits_v2).
        self.loss = masked_cross_entropy(logits, self.y_ph, n_tags, self.mask_ph)

        # Training operation that updates the network parameters.
        # Adam is used, with the learning rate supplied at run time through
        # learning_rate_ph. The op is stored in self.train_op.

        optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)
        self.train_op = optimizer.minimize(self.loss)

        # ================= Initialize the session =================

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        # The Saver is created after minimize(), i.e. after all variables of
        # the graph above have been defined.
        self.saver = tf.train.Saver()


    def __call__(self, tok_batch, mask_batch):
        """Return predicted tag indices for a batch of padded token indices.

        The dropout keep probability is fed as 1.0. ``mask_batch`` is fed to
        ``mask_ph``, although ``self.predictions`` is not computed from the
        mask in this graph.
        """
        feed_dict = {self.token_ph: tok_batch,
                     self.mask_ph: mask_batch,
                     self.dropout_keep_ph: 1.0}
        return self.sess.run(self.predictions, feed_dict)

    def train_on_batch(self, tok_batch, tag_batch, mask_batch, dropout_keep_prob, learning_rate):
        """Run one optimizer step on a batch; returns None.

        Args:
            tok_batch: Padded token indices, fed to ``token_ph``.
            tag_batch: Tag indices, fed to ``y_ph``.
            mask_batch: Mask, fed to ``mask_ph``.
            dropout_keep_prob: Keep probability for both dropout layers.
            learning_rate: Step size for the Adam optimizer.
        """
        feed_dict = {self.token_ph: tok_batch,
                     self.y_ph: tag_batch,
                     self.mask_ph: mask_batch,
                     self.dropout_keep_ph: dropout_keep_prob,
                     self.learning_rate_ph: learning_rate}
        self.sess.run(self.train_op, feed_dict)
        # NOTE(review): the original comment here read "Now, save the graph",
        # but this method contains no save call.

# Read the corpus from the local 'data' directory and wrap it in an iterator.
dataset = Conll2003DatasetReader().read('data')
get_mask = Mask()
data_iterator = DataLearningIterator(dataset)

# Token vocabulary gets an explicit unknown-token entry; the tag vocabulary
# has no special tokens.
special_tokens = ['<UNK>']
token_vocab = SimpleVocabulary(special_tokens, save_path='model/token.dict')
tag_vocab = SimpleVocabulary(save_path='model/tag.dict')

# Split the (tokens, tags) training pairs into two parallel lists.
train_samples = dataset['train']
all_tokens_by_sentences = [sample_tokens for sample_tokens, _ in train_samples]
all_tags_by_sentences = [sample_tags for _, sample_tags in train_samples]

# Both vocabularies are fitted on the training split only.
token_vocab.fit(all_tokens_by_sentences)
tag_vocab.fit(all_tags_by_sentences)

# Rebuild the network with two 100-filter convolutional layers, then
# overwrite its freshly initialized variables from the 'save_module' checkpoint.
n_tokens, n_tags = len(token_vocab), len(tag_vocab)
nernet = NerNetwork(n_tokens, n_tags, n_hidden_list=[100, 100])
nernet.saver.restore(nernet.sess, 'save_module')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\stron\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     C:\Users\stron\AppData\Roaming\nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


INFO:tensorflow:Restoring parameters from save_module


2018-11-30 22:09:02.753 INFO in 'tensorflow'['tf_logging'] at line 115: Restoring parameters from save_module


## Evaluate the neural network on the test split (precision / recall / F1), in batches of 1648 documents

In [5]:
# 16483 // 10 == 1648 samples per batch; batches are drawn from the 'test' split.
EVAL_BATCH_SIZE = 16483 // 10
test_batches = data_iterator.gen_batches(EVAL_BATCH_SIZE, 'test')
eval_valid(nernet, test_batches)

2018-11-30 22:46:05.812 DEBUG in 'deeppavlov.metrics.fmeasure'['fmeasure'] at line 286: processed 290143 tokens with 5136 phrases; found: 5224 phrases; correct: 4928.

precision:  94.33%; recall:  95.95%; FB1:  95.14

	REF: precision:  94.33%; recall:  95.95%; F1:  95.14 5224




In [11]:
import re
import time
from file_parser import loadData,repl


def estimateTime(numDocuments = 1000):
    """Print the average NER inference time per document.

    Loads up to ``numDocuments`` records with ``loadData()``. For each record
    it opens ``Decision_files/<doc_id_from with '/' replaced by '_'>.txt`` and
    times, line by line, the preprocessing (regex substitution, whitespace
    split, vocabulary lookup, zero padding, mask) plus the forward pass of
    the module-level ``nernet``. Opening and reading the file is outside the
    timed region.

    Args:
        numDocuments: Maximum number of documents to process. When fewer
            records are available, only those are processed and the average
            is taken over the documents actually processed.

    Returns:
        None. The average (0.0 when no document was processed) is printed.
    """
    data = loadData()[0:numDocuments]
    # The slice may hold fewer records than requested; iterate and average
    # over what was really loaded instead of indexing up to numDocuments.
    numProcessed = len(data)
    wholeTime = 0
    for doc in data:
        fileName = 'Decision_files/' + ('_').join(doc["doc_id_from"].split('/')) + '.txt'
        with open(fileName, encoding='utf-8') as file:
            for line in file:
                # perf_counter is a monotonic, high-resolution clock meant
                # for measuring short intervals.
                start = time.perf_counter()
                # NOTE(review): the class range starts with 'a' -- confirm it is
                # the intended Cyrillic letter and not the Latin look-alike.
                unsplittedLine = re.sub(r'[^a-яА-ЯёЁ0-9]', repl, line)
                # A batch containing a single tokenized line.
                x = [unsplittedLine.split()]
                x_inds = token_vocab(x)
                x_batch = zero_pad(x_inds)
                mask = get_mask(x)
                # Only the elapsed time matters; the predictions are discarded.
                nernet(x_batch, mask)
                end = time.perf_counter()
                wholeTime = wholeTime + (end - start)
    # Guard against an empty document list (previously a ZeroDivisionError).
    wholeTime = wholeTime / numProcessed if numProcessed else 0.0
    print("The average time per one document : " ,wholeTime, " seconds.")
    
# Run the timing benchmark with the default limit of 1000 documents.
estimateTime()


loadData started
The average time per one document :  0.03149499320983887  seconds.
