In [1]:
import torch
import numpy as np
from utils.data import Data
from ner_model import build_model, evaluate
import time
import nltk

%load_ext autoreload
%autoreload 2

In [2]:
def align_data(data):
    '''Pad parallel token sequences so that they line up column by column.

    Each token is left-justified to the width of the longest token found at
    the same position across all sequences, then followed by one separating
    space (so every returned string ends with a space).

    input:
        data, dict of sequences of strings, expected to have equal lengths:
            {'key1': ['I', 'dream', 'of', 'the', 'Moon'],
             'key2': ['O', 'O', 'O', 'O', 'B-LOC']}
    output:
        dict with the same keys, each mapped to a single aligned string:
            {'key1': 'I dream of the Moon  ',
             'key2': 'O O     O  O   B-LOC '}
        An empty input dict yields an empty dict.

    The first sequence fixes the number of aligned columns: a sequence that
    is shorter than the first one raises IndexError, and tokens beyond that
    length in longer sequences are ignored.
    '''
    sequences = list(data.values())
    if not sequences:
        # Nothing to align; looking up the first sequence would fail here.
        return {}

    # Column count comes from the first sequence; width is the longest token per column.
    n_columns = len(sequences[0])
    widths = [max(len(seq[i]) for seq in sequences) for i in range(n_columns)]

    # ljust(width + 1) pads the token to the column width plus one separator space.
    return {
        key: "".join(token.ljust(width + 1) for token, width in zip(seq, widths))
        for key, seq in data.items()
    }
# Example usage (relies on the `align` dict built in the last cell of this notebook):
#print(align_data(align)['inputs'])
#print(align_data(align)['labels'])

In [3]:
# Build the NER model once, in "decoding" mode, from pre-trained artefacts.
# path2xpt is passed to Data.load_export; path2model is handed to the config
# under 'load_model_dir' (the model file to load).
path2xpt = '../pretrained/default_cnn.xpt'
path2model = '../pretrained/default_cnn.0.model'
decode_config_dict = {'load_model_dir':path2model # load model file
                     }
data = Data()
#data.read_config(decode_config_dict)
print("NER MODEL: decoding-style loading..")
## dset_dir must only contain dictionary information here (the dset from the original model should be cleaned with a clean_dset function, which is still to be written)
data.load_export(path2xpt)
## supplementary configuration (optional, may not be useful in deployment)
data.read_config(decode_config_dict)
## NOTE: the weights should be loaded here, once, and not at each prediction
# Use the GPU only when CUDA is actually available on this machine.
data.HP_gpu = torch.cuda.is_available()
#data.show_data_summary()
model = build_model(data)

NER MODEL: decoding-style loading..
Load Model weights from file ../pretrained/default_cnn.0.model
building Network..
use crf:  True
use_char:  True
char feature extractor:  CNN
word feature extractor:  LSTM
Build word sequence feature extractor: LSTM...
Build word representation...
Build char sequence feature extractor: CNN..
build CRF...


In [6]:
input_data = 'I am working at the APHP. They have recently refused Google and Facebook cooperation. Camus wrote such beautiful plays'
## Client-side pre-processing: split into sentences, then into word tokens
sentences = nltk.sent_tokenize(input_data)
# input_client: flat token list without sentence separators (used for the client output)
# input_model: the same tokens, with an empty string appended after each sentence
input_client, input_model = [], []
for sent in sentences:
    tokens = nltk.word_tokenize(sent)
    input_client.extend(tokens)
    input_model.extend(tokens)
    input_model.append('')
print(input_client)
print(input_model)

['I', 'am', 'working', 'at', 'the', 'APHP', '.', 'They', 'have', 'recently', 'refused', 'Google', 'and', 'Facebook', 'cooperation', '.', 'Camus', 'wrote', 'such', 'beautiful', 'plays']
['I', 'am', 'working', 'at', 'the', 'APHP', '.', '', 'They', 'have', 'recently', 'refused', 'Google', 'and', 'Facebook', 'cooperation', '.', '', 'Camus', 'wrote', 'such', 'beautiful', 'plays', '']


In [9]:
# Time the whole request: instance generation followed by decoding.
start_time = time.time()
# Per the original note: self.fix_alphabet() placed inside generate_instance*
# should keep the vocabularies from growing indefinitely with fed inputs.
data.generate_instance_from_list(input_model)
speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, 'raw', label_flag=False)
timed = time.time() - start_time

print('Processing time {:.2} s'.format(timed))
print('Decoding speed: {0:.1f} st/s'.format(speed))
print(pred_results)

# Flatten the per-sentence label lists into one sequence for the client.
output_client = [label for sentence_labels in pred_results for label in sentence_labels]

output_aligned = align_data({'raw_input': input_client, 'labels':output_client})
print(output_aligned['raw_input'])
print(output_aligned['labels'])

Processing time 0.015 s
Decoding speed: 206.9 st/s
[['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'], ['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O'], ['B-PER', 'O', 'O', 'O', 'O']]
I am working at the APHP  . They have recently refused Google and   Facebook cooperation . Camus wrote such beautiful plays 
O O  O       O  O   B-LOC O O    O    O        O       B-ORG  I-ORG I-ORG    O           O B-PER O     O    O         O     


In [16]:
# NOTE(review): `input_seq` is not assigned in any cell visible in this notebook;
# unless it is defined elsewhere, this cell fails with NameError on a fresh kernel.
len(input_seq)

16

In [18]:
# Same evaluate(...) call as in the decoding cell above, run again on the current
# state of `data`; it overwrites speed, acc, p, r, f, pred_results and pred_scores.
speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, 'raw', label_flag=False)

In [19]:
# NOTE(review): `feed_data` is not assigned in any cell visible in this notebook;
# unless it is defined elsewhere, this cell fails with NameError on a fresh kernel.
print(feed_data)
# Bare last expression: the notebook displays pred_results as the cell output.
pred_results

['I', 'am', 'working', 'at', 'the', 'APHP', 'in', 'France', '.', '', 'But', 'sometimes', 'unwillingly', 'for', 'Google', 'and', 'Facebook', 'too', '.', '']


[['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O'],
 ['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O']]

In [2]:
# Toy example for align_data: a token sequence and its label sequence.
in_data = "I love you".split()
labels = ['B-Per'] + ['O'] * 2
align = dict(inputs=in_data, labels=labels)
