Train a three-column progressive neural network (ProgNet) for Named Entity Recognition, using the architecture described in [] (TODO: the citation is missing).

How to run:
1. Specify the data_dir. The directory should contain the train, val, and test folders, along with the 'feats' folder obtained through feat.ipynb.
2. Specify the model directory (model_dir). The directory must be created manually and must contain a params.json file.
3. The ProgNet-specific parameters are described below, in the cells where they are set.


In [2]:
# Load the autoreload extension and use mode 2 so that edits to the project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import sys

# Relative path to the repository root; it is resolved against the kernel's
# working directory. The `src.*` packages imported in the next cell and the
# data/model directories defined below are all located through it.
root_path = '../../../'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

In [4]:
import argparse
import logging
import os
import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from tqdm import trange
from src.ner import utils
from src.booster.progressive_data_loader import DataLoader
from src.booster.progressive_evaluate import evaluate
from src.booster.progressive_encoder import CharEncoder, EntityEncoder, WordEncoder
from src.booster.progNN.net import LSTMCRF

In [5]:
# Dataset directory of the new (target) task. Later cells read its 'feats'
# sub-folder to build the encoders and pass it to every DataLoader, including
# the loaders of the two pretrained columns.
data_dir = os.path.join(root_path, 'src/ner/data/bc5cdr_iobes_id')
# Experiment directory of the new column: params.json is read from it and
# train.log is written into it. It is also handed to train_and_evaluate.
model_dir = os.path.join(root_path, 'src/ner/experiments/test/test_prog3col')

In [6]:
"""
freeze_prev: whether to freeze the previous column
best_prev: whether to load the best checkpoint of the previous column, or a randomly initialized network. This is to see whether there is some benefit from the PNN structure, or just through more capacity.
linear_adapter: keep it to True mostly. False places a non-linear adapter. Check ProgNet paper for more details.

The parameters below are optimal.
"""

# Forwarded to ProgressiveNet(freeze_prev=...) when the network is assembled.
freeze_prev = True
# When True, best.pth from each pretrained model directory is loaded into the
# first and second column; when False those columns keep their fresh initialization.
best_prev = True
# Forwarded to Adapter(linear=...).
linear_adapter = True

In [8]:
"""
c1_model_dir: model directory for the first column
c2_model_dir: model directory for the second column
"""

# 0. pretrained model dir
# Later cells read params.json, data_encoder.pkl and label_encoder.pkl from
# each of these directories, plus best.pth when best_prev is True.
c1_model_dir = os.path.join(root_path, 'src/ner/experiments/disease/st/st_ncbi')
c2_model_dir = os.path.join(root_path, 'src/ner/experiments/disease/st/st_jnlpba')

In [9]:
# 1. set the device to train on
# CUDA is selected when available, CPU otherwise. The same device object is
# reused below for the data loaders, the models, the columns, the adapter and
# the progressive net.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
# 2. Load the parameters from json file
def _require_params_json(directory):
    """Return the path of params.json inside `directory`, asserting that the file exists."""
    json_path = os.path.join(directory, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    return json_path


# The two pretrained columns use the configuration stored next to their
# checkpoints; the new column reads its own configuration from model_dir.
# All three files are checked before any of them is parsed.
c1_network_params = _require_params_json(c1_model_dir)
c2_network_params = _require_params_json(c2_model_dir)
new_network_params = _require_params_json(model_dir)

c1_params, c2_params, new_params = [
    utils.Params(json_path)
    for json_path in (c1_network_params, c2_network_params, new_network_params)
]

# use GPU if available
for column_params in (c1_params, c2_params, new_params):
    column_params.cuda = torch.cuda.is_available()

# 3. Set the random seed for reproducible experiments
torch.manual_seed(230)
if new_params.cuda:
    torch.cuda.manual_seed(230)
np.random.seed(0)

In [15]:
# 4. Set the logger
# The log file lives inside the experiment directory of the new model.
utils.set_logger(os.path.join(model_dir, 'train.log'))

In [16]:
# 5. Create the input data pipeline
logging.info("Loading the datasets...")
# 5.1 specify features
from collections import OrderedDict

# 5.1.1 encoders for the new model, all built from the 'feats' folder of data_dir
logging.info('creating and loading data loaders')
feats_dir = os.path.join(data_dir, 'feats')
data_encoder = OrderedDict([
    (CharEncoder.FEATURE_NAME, CharEncoder(feats_dir)),
    (WordEncoder.FEATURE_NAME, WordEncoder(feats_dir, dim=new_params.embedding_dim)),
])
label_encoder = OrderedDict([
    (EntityEncoder.FEATURE_NAME, EntityEncoder(feats_dir)),
])
# The feature names are the encoder keys, in insertion order.
new_params.data_feats = list(data_encoder)
new_params.label_feats = list(label_encoder)

# 5.1.2 encoders for the 1st model, restored from its model directory
# NOTE(review): load_obj presumably unpickles these files - only point the
# model directories at trusted artifacts.
c1_data_encoder = utils.load_obj(os.path.join(c1_model_dir, 'data_encoder.pkl'))
c1_label_encoder = utils.load_obj(os.path.join(c1_model_dir, 'label_encoder.pkl'))
c1_params.data_feats = list(c1_data_encoder)
c1_params.label_feats = list(c1_label_encoder)

# 5.1.3 encoders for the 2nd model, restored from its model directory
c2_data_encoder = utils.load_obj(os.path.join(c2_model_dir, 'data_encoder.pkl'))
c2_label_encoder = utils.load_obj(os.path.join(c2_model_dir, 'label_encoder.pkl'))
c2_params.data_feats = list(c2_data_encoder)
c2_params.label_feats = list(c2_label_encoder)

# 5.2 load data
# 5.2.1 data loader for the new model: the only one that receives a label encoder
data_loader = DataLoader(
    new_params,
    data_dir,
    data_encoder,
    label_encoder,
    device=device,
)
# 5.2.2 data loaders for the previous columns: each reads the NEW data
# (data_dir) through the data encoder of its own pretrained model
c1_data_loader = DataLoader(
    c1_params,
    data_dir,
    c1_data_encoder,
    device=device,
)
c2_data_loader = DataLoader(
    c2_params,
    data_dir,
    c2_data_encoder,
    device=device,
)

Loading the datasets...
creating and loading data loaders


In [17]:
# 6. Modeling
def _build_lstmcrf(params, data_enc, label_enc):
    """Create an LSTMCRF from `params` and its encoders, moved to `device` and cast to float."""
    network = LSTMCRF(
        params=params,
        char_vocab_length=data_enc[CharEncoder.FEATURE_NAME].vocab_length,
        num_tags=label_enc[EntityEncoder.FEATURE_NAME].num_tags,
        pretrained_word_vecs=torch.from_numpy(data_enc[WordEncoder.FEATURE_NAME].vectors),
        dropout=params.dropout,
        decoder_type=params.decoder,
        bidirectional=params.rnn_bidirectional,
        freeze_embeddings=params.freeze_wordembeddings,
    )
    return network.to(device).float()


logging.info('freeze_prev: {}'.format(freeze_prev))
logging.info('best_prev: {}'.format(best_prev))

# 6.1.1 Define the model architecture for 1st and 2nd column
logging.info('loading previous models and creating new model')
c1_model = _build_lstmcrf(c1_params, c1_data_encoder, c1_label_encoder)
c2_model = _build_lstmcrf(c2_params, c2_data_encoder, c2_label_encoder)

# 6.1.1.1. load the pre-trained model to fit the architecture
# Skipped when best_prev is False, leaving both columns at their fresh initialization.
if best_prev:
    for pretrained_dir, pretrained_model in ((c1_model_dir, c1_model),
                                             (c2_model_dir, c2_model)):
        utils.load_checkpoint(os.path.join(pretrained_dir, 'best.pth'), pretrained_model)

# 6.1.2 Define the model for target column
# Built after the previous columns so the models are created in the order c1, c2, c3.
c3_model = _build_lstmcrf(new_params, data_encoder, label_encoder)

freeze_prev: True
best_prev: True
loading previous models and creating new model
  "num_layers={}".format(dropout, num_layers))


In [18]:
# 7. Convert to columns
"""
the columns must specify the name of the layers to be used for PNN transfer, in the order from input to output.
The numbers in the bracket represent the input and output dimension of the layer respectively.
"""
logging.info('creating columns')
from src.booster.progNN.column import Column


def _as_column(model, label_enc, loader):
    """Wrap `model` in a Column exposing its rnn_1, rnn_2 and fc layers, moved to `device`."""
    # NOTE(review): 130 and 200 are hard-coded layer sizes - confirm that they
    # match the dimensions configured in each column's params.json.
    transfer_layers = {
        'rnn_1': (130, 200),
        'rnn_2': (200, 200),
        'fc': (200, label_enc[EntityEncoder.FEATURE_NAME].num_tags),
    }
    return Column(model=model, layers=transfer_layers, data_loader=loader).to(device)


# Each column pairs a model with the label encoder and data loader of its own task.
column_1 = _as_column(c1_model, c1_label_encoder, c1_data_loader)
column_2 = _as_column(c2_model, c2_label_encoder, c2_data_loader)
column_3 = _as_column(c3_model, label_encoder, data_loader)

creating columns


In [19]:
from src.booster.progNN.adapter import Adapter
# The adapter connects the two previous columns to the target column.
adapter = Adapter(
    prev_columns=[column_1, column_2],
    target_column=column_3,
    linear=linear_adapter,
).to(device)

# 8. create progressive net
logging.info('creating progressive net')
from src.booster.progNN.prognet import ProgressiveNet

# 8.1. load the progNet
progNet = ProgressiveNet(
    prev_columns=[column_1, column_2],
    target_column=column_3,
    adapter=adapter,
    freeze_prev=freeze_prev,
).to(device).float()

# Report the overall size of the network and how much of it will be updated.
progNet_total_params = sum(p.numel() for p in progNet.parameters())
progNet_total_trainable_params = sum(
    p.numel() for p in progNet.parameters() if p.requires_grad
)
logging.info('total params: {}'.format(progNet_total_params))
logging.info('total trainable params: {}'.format(progNet_total_trainable_params))

# 8.2. define the metrics
from src.ner.evaluation import accuracy_score, f1_score, precision_score, recall_score
metrics = dict(
    accuracy=accuracy_score,
    f1_score=f1_score,  # micro F1 score
    precision_score=precision_score,
    recall_score=recall_score,
)

# 8.3. Define the optimizer
# Only parameters that still require gradients are handed to SGD.
trainable_parameters = [p for p in progNet.parameters() if p.requires_grad]
optimizer = optim.SGD(params=trainable_parameters, lr=0.015, momentum=0.9)

linear adapter: True
creating progressive net
total params: 121391698
total trainable params: 519090


In [20]:
# 9. Train ProgNet
from src.booster.progressive_ner_3col import train_and_evaluate
logging.info('Training progressive net')

# The data loader, the data splits and the encoders all come from the target column.
target = progNet.target_column
train_and_evaluate(
    progNet,
    target.data_loader,
    target.data['train'],
    target.data['val'],
    target.data['test'],
    optimizer,
    metrics,
    new_params,
    model_dir,
    target.data_loader.data_encoder,
    target.data_loader.label_encoder,
    restore_file=None,
    save_model=False,
    eval=True,
)

Training progressive net
Epoch 1/2
Learning Rate : [0.015]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:45<00:00, 10.13it/s, loss=67.675]
- Train metrics: accuracy: 0.783 ; f1_score: 0.322 ; precision_score: 0.539 ; recall_score: 0.230 ; loss: 189.397
- val metrics : accuracy: 0.933 ; f1_score: 0.666 ; precision_score: 0.611 ; recall_score: 0.732 ; loss: 38.025
- test metrics : accuracy: 0.934 ; f1_score: 0.656 ; precision_score: 0.603 ; recall_score: 0.718 ; loss: 37.550
- Found new best F1 score
Epoch 2/2
Learning Rate : [0.014285714285714284]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:45<00:00,  9.98it/s, loss=49.787]
- Train metrics: accuracy: 0.930 ; f1_score: 0.711 ; precision_score: 0.696 ; recall_score: 0.727 ; loss: 49.686
- val metrics : accuracy: 0.934 ; f1_score: 0.694 ; precision_score: 0.673 ; recall_score: 0.717 ; loss: 33.989
- test metrics : accuracy: 0.935 ; f1_score: 0.682 ; precision

1