In [1]:
import torch
import numpy as np
from utils.data import Data
from ner_model import build_model, evaluate
import time
import nltk
import os


%load_ext autoreload
%autoreload 2

In [26]:
def align_data(data):
    '''Pad the tokens of several parallel sequences so that they line up column by column.

        input:
            data, dict of token sequences (lists of strings) of equal length, e.g.
                {'key1': ['I', 'dream', 'of', 'the', 'Moon'],
                 'key2': ['O', 'O', 'O', 'O', 'B-LOC']}
        output:
            dict with the same keys, each mapped to ONE string in which every token is
            left-justified to the width of the longest token found at that position
            (over all sequences) and followed by a single space, e.g.
                {'key1': 'I dream of the Moon  ',
                 'key2': 'O O     O  O   B-LOC '}
            An empty dict gives an empty dict.
    '''
    # Nothing to align: avoids indexing the first sequence of an empty dict.
    if not data:
        return dict()

    sequences = list(data.values())
    # The first sequence decides how many columns are aligned.
    n_columns = len(sequences[0])
    widths = [max(len(seq[col]) for seq in sequences) for col in range(n_columns)]

    # ljust(width + 1) pads the token to the column width plus one separating space.
    return {
        key: "".join(token.ljust(width + 1) for token, width in zip(seq, widths))
        for key, seq in data.items()
    }
#print(align_data(align)['inputs'])
#print(align_data(align)['labels'])

In [8]:
# Baseline model: location of the exported data settings (.xpt) and of the weights (.model).
path2xpt = '../pretrained/baseline.xpt'
path2model = '../pretrained/baseline.model'
decode_config_dict = {'load_model_dir':path2model # path of the model weights file
                     }
data = Data()
#data.read_config(decode_config_dict)
print("NER MODEL: decoding-style loading..")
# The export file must contain only the dictionary information: a dset saved by the
# original model should first be cleaned with a clean_dset function (TODO: to be coded).
data.load_export(path2xpt)
# Supplementary configuration applied after the export (optional, maybe not useful in deployment).
data.read_config(decode_config_dict)
# TODO: the weights should be loaded once here, and not at each prediction.
# HP_gpu mirrors CUDA availability on this machine
# (presumably it toggles GPU execution in the model; confirm in utils.data / ner_model).
data.HP_gpu = torch.cuda.is_available()
#data.show_data_summary()
model = build_model(data)

NER MODEL: decoding-style loading..
Load Model weights from file ../pretrained/baseline.model
building Network..
use crf:  True
use_char:  True
char feature extractor:  CNN
word feature extractor:  LSTM
Build word sequence feature extractor: LSTM...
Build word representation...
Build char sequence feature extractor: CNN..
build CRF...


In [12]:
file_name = '../prod_data/wiki_en_france.txt'
# NOTE(review): the folder name looks like a typo of 'preprocessed'; it is kept
# unchanged because it decides where the output file lands on disk.
out_folder = 'proprecessed/'
# exist_ok=True replaces the isdir() check followed by mkdir(): no check-then-create
# race, and missing parent folders are created if out_folder ever becomes nested.
os.makedirs(out_folder, exist_ok=True)
# Output file: same base name as the input file, with a '.out' extension.
path2write = os.path.join(out_folder, os.path.basename(os.path.splitext(file_name)[0]) + '.out')
# Read the whole text of the file. The encoding is explicit so that non-ASCII characters
# are decoded the same way whatever the platform's default locale is
# (assumes the input file is UTF-8; TODO confirm).
with open(file_name, 'r', encoding='utf-8') as f:
    input_data = f.read()

In [13]:
# Client-side pre-processing: split the raw text into sentences, then each sentence into tokens.
# Short input for a quick manual test:
#input_data = 'I am working at the APHP. They have recently refused Google and Facebook cooperation. Camus wrote such beautiful plays'
sentences = nltk.sent_tokenize(input_data)
input_client = []  # flat token sequence without '' separators, kept for the client output
input_model = []   # same tokens, with a '' appended after each sentence
for sent in sentences:
    tokens = nltk.word_tokenize(sent)
    input_client.extend(tokens)
    input_model.extend(tokens)
    input_model.append('')
#print(input_client)
#print(input_model)

In [21]:
# perf_counter() is a monotonic, high-resolution clock: the measured duration cannot be
# skewed by system clock adjustments, unlike a difference of time.time() values.
start_time = time.perf_counter()
# NOTE (original author): self.fix_alphabet(), placed inside generate_instance*, should
# prevent the vocabularies from growing indefinitely with the inputs fed here.
data.generate_instance_from_list(input_model)
# label_flag=False: presumably no gold labels exist for this raw text, so acc / p / r / f
# are not meaningful here (confirm in ner_model.evaluate). Only speed and pred_results are used.
speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, 'raw', label_flag=False)

timed = time.perf_counter() - start_time
# Fixed-point format: the former '{:.2}' kept only 2 significant digits and switched to
# scientific notation for long runs (e.g. 123.4 s was printed as '1.2e+02').
print('Processing time {:.3f} s'.format(timed))
print('Decoding speed: {0:.1f} st/s'.format(speed))
print(pred_results)
# Alternative (disabled): rebuild a single flat label sequence for the client and
# display it aligned under the input tokens with align_data.
#output_client = []
#for l in pred_results:
#    output_client += l
#output_aligned = align_data({'raw_input': input_client, 'labels':output_client})
#print(output_aligned['raw_input'])
#print(output_aligned['labels'])

# One output line per sentence: its predicted labels separated by single spaces.
out = [' '.join(sent) +'\n' for sent in pred_results]
print(out)
# The handle is not named `f`, so the f-score unpacked above is not overwritten.
with open(path2write, 'w', encoding='utf-8') as out_file:
    out_file.writelines(out)
    

Processing time 0.042 s
Decoding speed: 747.7 st/s
[['B-LOC', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-LOC', 'I-LOC', 'O'], ['O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-MISC', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

## Decoding tests

In [5]:
# Files used by the decoding test.
# Note: path2model is re-assigned here; it pointed to the baseline weights in an earlier cell.
path2gold = '../conll2003/test.conll2003'
path2decode = '../pretrained/myModel/myModel.out'
path2dset = '../pretrained/myModel/myModel.dset'
path2model = '../pretrained/myModel/myModel.0.model'

# Configuration handed to Data.read_config for the decoding test.
conf_dict = {
    'raw_dir': path2gold,
    'decode_dir': path2decode,
    'dset_dir': path2dset,
    'load_model_dir': path2model,
    'number_normalized': True,
}

In [6]:
# NOTE(review): Data is already imported in the first cell; notebook imports are
# easier to track when they all live in that top cell.
from utils.data import Data
from ner_model import load_model_decode

# Fresh Data object for the decoding test; it replaces the one built for the baseline model.
data = Data()
# The configuration is read before AND after load(): presumably the first call provides
# data.dset_dir, and the second re-applies conf_dict over whatever load() restored from
# the dset file (TODO confirm in utils.data). Keep this order.
data.read_config(conf_dict)
data.load(data.dset_dir)
data.read_config(conf_dict)
# HP_gpu mirrors CUDA availability on this machine.
data.HP_gpu = torch.cuda.is_available()
data.generate_instance('raw')
# Size check on the 'raw' and 'test' text lists held by data (both 3683 in the recorded run).
print(len(data.raw_texts))
print(len(data.test_texts))

3683
3683


In [7]:
# Decode the 'raw' instances; only the first returned value is kept, the second is discarded.
# label_flag=True: presumably enables scoring against the gold labels (the recorded run
# printed gold/predict/right counts and acc, p, r, f); confirm in ner_model.
decode_results, _ = load_model_decode(data, 'raw', label_flag=True)

Load Model from file ../pretrained/myModel/myModel
building Network..
use crf:  True
use_char:  True
char feature extractor:  CNN
word feature extractor:  LSTM
Build word sequence feature extractor: LSTM...
Build word representation...
Build char sequence feature extractor: CNN..
build CRF...
gold_num = 5596; predict_num = 5696; right_num = 4683
raw: time10.67s, speed: 349.10st/s; acc: 0.9656, p: 0.8222, r: 0.8368, f: 0.8294


In [8]:
# Number of decoded results (3683 in the recorded run, the same as len(data.raw_texts)).
print(len(decode_results))

3683


10

In [11]:
# Write the decoded 'raw' results to disk; the recorded run reports the target file
# ../pretrained/myModel/myModel.out, i.e. the path configured under 'decode_dir'.
data.write_decoded_results(decode_results, 'raw')

Predict raw result has been written into file ../pretrained/myModel/myModel.out
