In [1]:
import torch
import numpy as np
from utils.data import Data
from ner_model import train, data_initialization, build_model, evaluate
import time
import nltk
import os

%load_ext autoreload
%autoreload 2

# How to train a model on conll2003 shared task training set

In [5]:
path2train = '../conll2003/train.conll2003'
path2dev = '../conll2003/dev.conll2003'
path2test = '../conll2003/test.conll2003'
path2model = '../pretrained/myModel/myModel'
modelDir = os.path.join(*path2model.split('/')[:-1])
if not os.path.isdir(modelDir):
    os.mkdir(modelDir)
path2emb = '../pretrained/glove.6B.50d.txt'
confdict = {# IO
            'train_dir':path2train,
            'dev_dir':path2dev,
            'test_dir':path2test,
            'model_dir':path2model,
            # Embeddings
            'word_emb_dir':path2emb,
            'char_emb_dir':None,
            'word_emb_dim':50,
            'char_emb_dim':30,
            # Network
            'use_crf':True,
            'use_char':True,
            'use_feats': False,
            'word_feature_extractor':'LSTM', # choose CNN/LSTM/GRU
            'char_feature_extractor':'LSTM', # choose CNN/LSTM/GRU
            # HP
            'HP_cnn_layer':4 ,
            'HP_char_hidden_dim':50,
            'HP_hidden_dim':200,
            'HP_dropout':0.5,
            'HP_lstm_layer':1,
            'HP_bilstm':True,
            'HP_lr':0.015,
            # training
            'optimizer':'SGD',
            'batch_size':10,
            'iteration':5
               }

In [6]:
# initialization of data object and training (equivalent to main.myTrain(confdict))

data = Data()
data.read_config(confdict)
data.HP_gpu = torch.cuda.is_available()
data_initialization(data)
data.generate_instance('train')
data.generate_instance('dev')
data.generate_instance('test')
data.build_pretrain_emb()

Load pretrained word embedding, norm False, dir: ../pretrained/glove.6B.50d.txt
Embedding: 
 pretrain words: 400000, perfect_match: 14618, case_match: 11722, oov: 3951


In [7]:
train(data)

Training model...
****************************************
----------Data summary:----------

 HP_gpu: False
 MAX_SENTENCE_LENTGH: 1000
 number_normalized: False
 word_alphabet: 30292
 char_alphabet_size: 87
 label_alphabet_size: 10
 load_model_dir: None


I/O:
 tagScheme: BIO
 train_dir: ../conll2003/train.conll2003
 dev_dir: ../conll2003/dev.conll2003
 test_dir: ../conll2003/test.conll2003
 raw_dir: None
 dset_dir: None
 word_emb_dir: ../pretrained/glove.6B.50d.txt
 char_emb_dir: None
 feature_emb_dirs: []


Network:
 word_feature_extractor: LSTM
 use_char: True
 char_feature_extractor: LSTM
 use_crf: True


Network Hyperparameters:
 word_emb_dim: 50
 char_emb_dim: 30
 feature_emb_dims: []
 HP_char_hidden_dim: 50
 HP_hidden_dim: 200
 HP_lstm_layer: 1
 HP_bilstm: True
 HP_cnn_layer: 4
 HP_dropout: 0.5


Training Hyperparameters:
 average_batch_loss: False
 optimizer: SGD
 iteration: 5
 batch_size: 10
 HP_lr: 0.015
 HP_lr_decayr: 0.05
 HP_clip: None
 HP_momentum: 0
 HP_l2: 1e-08
******

0.9160898193483031

# How to decode a new input from a pretrained model

In [6]:
path2xpt = '../pretrained/myModel/myModel.xpt'
path2model = '../pretrained/myModel/myModel.1.model'
decode_config_dict = {'load_model_dir':path2model # load model file
                     }
data = Data()
## dset_dir must only contains dictionnary informations here (dset from the original model should be cleaned with the function clean_dset (to be coded))
data.load_export(path2xpt)
## supplementary configurations (optional, maybe not useful in deployment)
data.load_model_dir = path2model
## !!! we should be loading the weights here and not at each prediction!!!!
data.HP_gpu = torch.cuda.is_available()
#data.show_data_summary()
model = build_model(data)

Load Model weights from file ../pretrained/myModel/myModel.1.model
building Network..
use crf:  True
use_char:  True
char feature extractor:  CNN
word feature extractor:  LSTM
Build word sequence feature extractor: LSTM...
Build word representation...
Build char sequence feature extractor: CNN..
build CRF...


In [7]:
file_name = '../prod_data/wiki_en_france.txt'
out_folder = 'proprecessed/'
if not os.path.isdir(out_folder):
    os.mkdir(out_folder)
path2write = out_folder + os.path.basename(os.path.splitext(file_name)[0]) + '.out'
# open and return the text of the file
with open(file_name, 'r') as f:
    input_data = f.read()

In [8]:
#input_data = 'I am working at the APHP. They have recently refused Google and Facebook cooperation. Camus wrote such beautiful plays'
## Pre-processing from client 
sentences = nltk.sent_tokenize(input_data)
input_client = []
input_model = []
for sent in sentences:
    tokens = nltk.word_tokenize(sent)
    # we have to keep a sequence wo '' sentences separators for the client output
    input_client += tokens
    input_model += tokens + ['']
#print(input_client)
#print(input_model)

In [9]:
start_time = time.time()
#print(feed_data)
### self.fix_alphabet() placed inside generate_instance* should prevent the vocabularies to grow indefinitely with fed inputs
data.generate_instance_from_list(input_model)
#print('***************')
#print(data.raw_texts)
#print(evaluate(data, model, 'raw', label_flag=False))
#print('*****************')
speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, 'raw', label_flag=False) 

timed = time.time() - start_time
print('Processing time {:.2} s'.format(timed))
print('Decoding speed: {0:.1f} st/s'.format(speed))
print(pred_results)
# reconstruct a unique sequence for the client
#output_client = []
#for l in pred_results:
#    output_client += l

#output_aligned = align_data({'raw_input': input_client, 'labels':output_client})
#print(output_aligned['raw_input'])
#print(output_aligned['labels'])
out = [' '.join(sent) +'\n' for sent in pred_results]
print(out)
with open(path2write, 'w') as f:
    f.writelines(out)
    

Processing time 0.091 s
Decoding speed: 319.2 st/s
[['B-ORG', 'O', 'B-PER', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O'], ['B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O'], ['B-ORG', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O'], ['B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'O', 'B-PE

# Load training informations of model