In [41]:
import time
import os
import torch
import numpy as np
from main import easyTrain

# simple decoding
import sys
sys.path.insert(1, '../')
from serve import get_model_api
from nermodel.utils.functions import build_ann
from nermodel.utils.tokenizer import myTokenizer
# visualization
from spacy import displacy

# detailed decoding
from utils.data import Data
from ner_model import load_model_decode, build_model

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Train a model on CoNLL2003 data (simple version)

This notebook gives the appropriate code to train a SotA NER model on the [conll2003 corpus](https://www.clips.uantwerpen.be/conll2003/ner/) with a neural tagger. This corpus contains 592 documents.

## Define paths to data, saved model, pretrained embeddings

In [46]:
# Dataset splits (CoNLL-2003 files, relative to the notebook directory).
path2train = '../conll2003/train.conll2003'
path2dev = '../conll2003/dev.conll2003'
path2test = '../conll2003/test.conll2003'

# Value passed to training as 'model_dir'. The last path component is used as
# a file-name prefix (this notebook later loads 'myModel.xpt' and
# 'myModel.0.model' from the same folder), so only its parent is a directory.
path2model = '../pretrained/myModel/myModel'

# Make sure the output directory exists before training writes into it.
# os.path.dirname keeps absolute paths absolute (splitting on '/' and
# re-joining would drop the leading separator), and makedirs also creates
# missing parent directories and is a no-op when the directory already exists.
modelDir = os.path.dirname(path2model)
if modelDir:
    os.makedirs(modelDir, exist_ok=True)

# Optional pretrained word embeddings, e.g. '../pretrained/glove.6B.50d.txt'.
# None means no pretrained embedding file is given.
path2emb = None

## Configuration file

In [47]:
# Training configuration handed to easyTrain().
confdict = {
    # --- I/O ---
    'train_dir': path2train,
    'dev_dir': path2dev,
    'test_dir': path2test,
    'model_dir': path2model,
    # NOTE(review): the spelling 'LENTGH' matches the label printed in the
    # training data summary, but the decoding config later in this notebook
    # uses 'MAX_SENTENCE_LENGTH' -- confirm which key the config reader expects.
    'MAX_SENTENCE_LENTGH': 2500,
    'number_normalized': False,

    # --- Embeddings ---
    'word_emb_dir': path2emb,   # path to pretrained word embeddings (None by default)
    'char_emb_dir': None,       # set a path here to use pretrained char embeddings
    'word_emb_dim': 200,        # word embedding dimension
    'char_emb_dim': 30,         # char embedding dimension

    # --- Network ---
    'use_crf': True,                    # put a CRF layer at the end of the network
    'use_char': True,                   # use character embeddings
    'use_feats': False,                 # do not use additional features
    'word_feature_extractor': 'LSTM',   # choose CNN/LSTM/GRU
    'char_feature_extractor': 'LSTM',   # choose CNN/LSTM/GRU

    # --- Network hyperparameters ---
    'HP_cnn_layer': 4,
    'HP_char_hidden_dim': 50,
    'HP_hidden_dim': 200,
    'HP_dropout': 0.5,
    'HP_lstm_layer': 1,
    'HP_bilstm': True,
    'HP_lr': 0.007,

    # --- Training ---
    'optimizer': 'SGD',
    'batch_size': 10,
    # Number of iterations: 30 is a good choice, you can go up to 50
    # (more is not very useful).
    'iteration': 5,
}

## Training 

In [4]:
# Train a model with the settings in confdict. The return value is kept as
# `fscore` (presumably the evaluation F-score -- TODO confirm against
# main.easyTrain).
fscore = easyTrain(confdict)

Model Train
Training model...
****************************************
----------Data summary:----------

 HP_gpu: False
 MAX_SENTENCE_LENTGH: 1000
 number_normalized: False
 word_alphabet: 30292
 char_alphabet_size: 87
 label_alphabet_size: 10
 load_model_dir: None


I/O:
 tagScheme: BIO
 train_dir: ../conll2003/train.conll2003
 dev_dir: ../conll2003/dev.conll2003
 test_dir: ../conll2003/test.conll2003
 raw_dir: None
 elmo_opt: None
 elmo_weights: None
 dset_dir: None
 word_emb_dir: None
 char_emb_dir: None
 feature_emb_dirs: []


Network:
 word_feature_extractor: LSTM
 use_char: True
 char_feature_extractor: LSTM
 use_crf: True
 use_elmo: False


Network Hyperparameters:
 word_emb_dim: 200
 char_emb_dim: 30
 feature_emb_dims: []
 HP_char_hidden_dim: 50
 HP_hidden_dim: 200
 HP_lstm_layer: 1
 HP_bilstm: True
 HP_cnn_layer: 4
 HP_dropout: 0.5


Training Hyperparameters:
 average_batch_loss: False
 optimizer: SGD
 iteration: 5
 batch_size: 10
 HP_lr: 0.007
 HP_lr_decayr: 0.05
 HP_clip: N

KeyboardInterrupt: 

# Decode a new file with a trained model (simple version)

#### Build customized decoder and load raw text

In [13]:
# Tokenizer later passed to the model and to build_ann.
# NOTE(review): 'fr' looks like a language code, while the model trained in
# this notebook uses the CoNLL-2003 corpus -- confirm that the tokenizer
# language matches the text being decoded.
tokenizer = myTokenizer('fr')

In [14]:
# Raw text file to annotate.
path2text = '../decoding/ins/topology.txt'
# Read the whole file as one string. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(path2text, 'r', encoding='utf-8') as f:
    text = f.read()

**Load the model**: You have to specify the architecture (model options) and the weights of the trained model, as well as the path where the decoded data should be written.

In [15]:
# Files of the trained model: the exported options (.xpt) and the weights.
# NOTE(review): this rebinds `path2model`, which the training section uses as
# the save-path prefix in confdict; re-run the path-definition cell before
# rebuilding confdict.
path2xpt = '../pretrained/myModel/myModel.xpt'
path2model = '../pretrained/myModel/myModel.0.model'
# `model` is called in the decoding cell with the raw text and the tokenizer.
model = get_model_api(path2xpt=path2xpt, path2model=path2model)

************************************************************
NER MODEL: loading model, decoding-style ...
****************************************
----------Data summary:----------

 HP_gpu: False
 MAX_SENTENCE_LENTGH: 1000
 number_normalized: False
 word_alphabet: 30292
 char_alphabet_size: 87
 label_alphabet_size: 10
 load_model_dir: ../pretrained/myModel/myModel.0.model


Network:
 word_feature_extractor: LSTM
 use_char: True
 char_feature_extractor: LSTM
 use_crf: True
 use_elmo: False


Network Hyperparameters:
 word_emb_dim: 200
 char_emb_dim: 30
 feature_emb_dims: []
 HP_char_hidden_dim: 50
 HP_hidden_dim: 200
 HP_lstm_layer: 1
 HP_bilstm: True
 HP_cnn_layer: 4
 HP_dropout: 0.5
****************************************

Load Model weights from file ../pretrained/myModel/myModel.0.model
building Network..
use crf:  True
use_char:  True
char feature extractor:  LSTM
word feature extractor:  LSTM
Build word sequence feature extractor: LSTM...
Build word representation...
build char 

## Decode

In [16]:
# Run the tagger on the raw text; it returns a (text, annotations) pair.
# NOTE(review): `text` is rebound to the returned value, so re-running this
# cell feeds the previous result back in rather than the file content.
text, annotations = model(input_data=text, live = False, tokenizer = tokenizer)

Processing time 0.57 s
Decoding speed: 270.72 st/s


Get spans and a visualizable form of entities

In [17]:
# Convert the decoder output into entity spans ({'type', 'begin', 'end'} dicts,
# see the preview below) plus `visu_ents`, the entity list given to displaCy.
# NOTE(review): `annotations` is rebound here, so this cell cannot be re-run
# on its own; re-run the decoding cell first.
annotations, visu_ents = build_ann(tokenizer.tokenize(text), annotations, visu = True, form = 'min')

See first annotations:

In [19]:
# Preview only the first ten entity spans instead of dumping the whole list.
annotations[:10]

[{'type': 'PER', 'begin': 35, 'end': 46},
 {'type': 'PER', 'begin': 658, 'end': 675},
 {'type': 'MISC', 'begin': 733, 'end': 738},
 {'type': 'PER', 'begin': 739, 'end': 744},
 {'type': 'ORG', 'begin': 790, 'end': 795},
 {'type': 'PER', 'begin': 796, 'end': 801},
 {'type': 'PER', 'begin': 833, 'end': 848},
 {'type': 'PER', 'begin': 850, 'end': 863},
 {'type': 'PER', 'begin': 867, 'end': 885},
 {'type': 'PER', 'begin': 890, 'end': 908}]

Visualize the text and annotations

In [22]:
# Render the entities inline in the notebook. manual=True because the input is
# a plain dict (text + entity list) rather than a spaCy Doc object.
displacy.render([{'text':text, 'ents':visu_ents, 'title':None}], style='ent', jupyter=True, manual=True)

# Detailed decoding (measure performances)
If you have test data that you want to use to evaluate the model, you can use the following code to perform this evaluation. Note that your data should be in the same format as the data used to train the model (a CoNLL-like format).

In [68]:
# Trained model: exported options (.xpt) and weights checkpoint.
path2xpt = '../pretrained/myModel/myModel.xpt'
path2model = '../pretrained/myModel/myModel.0.model'
# Annotated data to evaluate on.
path2test_data = '../conll2003/test.bio'
# With None, the evaluation cell below skips writing decoded results
# (it only writes when data.decode_dir is truthy).
path2decode_dir = None

# Decoding configuration consumed by Data.read_config().
# NOTE(review): 'MAX_SENTENCE_LENGTH' is spelled differently from the
# 'MAX_SENTENCE_LENTGH' key used in the training config -- confirm which
# spelling the config reader expects.
decodedict = dict(
    xpt_dir=path2xpt,                # model options
    load_model_dir=path2model,       # model weights
    raw_dir=path2test_data,          # data to be decoded
    decode_dir=path2decode_dir,
    number_normalized=False,
    MAX_SENTENCE_LENGTH=2500,
)

In [69]:
# Build the decoding configuration:
#   1. read decodedict (needed at least for data.xpt_dir, used on the next line),
#   2. load the export stored at data.xpt_dir,
#   3. read decodedict again -- presumably so its values take precedence over
#      whatever the export restored (TODO confirm against Data.load_export).
# The bare `data.load_model_dir` expression that used to sit between steps 1
# and 2 was removed: it was neither displayed nor assigned.
data = Data()
data.read_config(decodedict)
data.load_export(data.xpt_dir)
data.read_config(decodedict)
# Use the GPU only when one is actually available on this machine.
data.HP_gpu = torch.cuda.is_available()
data.show_data_summary()
data.generate_instance('raw')

# Decode the 'raw' split; the second returned value is not used here.
decode_results, _ = load_model_decode(data, 'raw')
# Write the decoded output only when an output location was configured.
if data.decode_dir:
    data.write_decoded_results(decode_results, 'raw')