Train a network for Named Entity Recognition using the two-column progressive neural network (ProgNet) architecture [citation missing].

How to run:
1. Specify `data_dir`. The directory should contain the `train`, `val`, and `test` folders, along with the `feats` folder produced by `feat.ipynb`.
2. Specify `model_dir`. The directory must be created manually and must contain a `params.json`.
3. The ProgNet-specific parameters are described in the respective cells below.


In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys

# Make the repository root importable so that the `src.*` packages resolve.
# The membership guard keeps this cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
root_path = '../../../'
if root_path not in sys.path:
    sys.path.append(root_path)

In [3]:
import argparse
import logging
import os
import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from tqdm import trange
from src.ner import utils
from src.booster.progressive_data_loader import DataLoader
from src.booster.progressive_evaluate import evaluate
from src.booster.progressive_encoder import CharEncoder, EntityEncoder, WordEncoder
from src.booster.progNN.net import LSTMCRF

In [4]:
# Dataset directory for the new (target) task; it must hold the train/val/test
# folders and the 'feats' folder that the encoders below read from.
data_dir = os.path.join(root_path, 'src/ner/data/bc5cdr_iobes_id')
# Experiment directory of the new model: params.json is read from here and
# train.log is written here.
model_dir = os.path.join(root_path, 'src/ner/experiments/test/test_prog2col')

In [5]:
"""
Switches that control how the progressive network (PNN) is assembled.

freeze_prev: whether to freeze the previous column.
best_prev: whether to load the best checkpoint of the previous column (True) or keep it as a
    randomly initialized network (False). This helps to tell whether a gain comes from the PNN
    structure or merely from the extra capacity.
linear_adapter: True uses a linear adapter, False places a non-linear adapter. Keep it True in
    most cases; check the ProgNet paper for more details.
best_target: whether to fine-tune the target model in the PNN way. True loads the best.pth of the
    target model; otherwise the target model keeps its random initialization. Best kept False.

The values below are the ones reported as optimal.
"""

freeze_prev = True
best_prev = True
linear_adapter = True
best_target = False

In [6]:
"""
pretrained_model_dir: model directory used for the 1st (previous) column.
target_model_dir: model directory for the target column.

Set both paths below before running.
"""


pretrained_model_dir = os.path.join(root_path, 'src/ner/experiments/disease/st/st_ncbi')
target_model_dir = os.path.join(root_path, 'src/ner/experiments/disease/st/st_bc5cdr_all') # only read when best_target = True

In [7]:
# 1. Pick the compute device: CUDA when a GPU is visible, otherwise the CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [8]:
# 2. Load the parameters of both columns from their params.json files
def _require_params_json(directory):
    """Return the path of params.json inside `directory`, asserting that the file exists."""
    json_path = os.path.join(directory, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    return json_path


# the previous column's parameters are read from the pretrained model's directory
pretrained_network_params = _require_params_json(pretrained_model_dir)
new_network_params = _require_params_json(model_dir)

pre_params = utils.Params(pretrained_network_params)
new_params = utils.Params(new_network_params)

# use GPU if available (same flag on both parameter sets)
_gpu_available = torch.cuda.is_available()
pre_params.cuda = _gpu_available
new_params.cuda = _gpu_available

# 3. Set the random seeds for reproducible experiments
torch.manual_seed(230)
if new_params.cuda:
    torch.cuda.manual_seed(230)
np.random.seed(0)

In [9]:
# 4. Set the logger, handing it the path of train.log inside the new model's directory
train_log_path = os.path.join(model_dir, 'train.log')
utils.set_logger(train_log_path)

In [10]:
# 5. Create the input data pipeline
logging.info("Loading the datasets...")
# 5.1 specify features
from collections import OrderedDict

# 5.1.1 encoders for the new model
logging.info('creating and loading data loaders')
data_encoder = OrderedDict()
data_encoder[CharEncoder.FEATURE_NAME] = CharEncoder(os.path.join(data_dir, 'feats'))
data_encoder[WordEncoder.FEATURE_NAME] = WordEncoder(os.path.join(data_dir, 'feats'),
                                                     dim=new_params.embedding_dim)
label_encoder = OrderedDict()
label_encoder[EntityEncoder.FEATURE_NAME] = EntityEncoder(os.path.join(data_dir, 'feats'))
new_params.data_feats = []
new_params.label_feats = []
for feat in data_encoder:
    new_params.data_feats.append(feat)
for feat in label_encoder:
    new_params.label_feats.append(feat)

# 5.1.2 encoders for the previous model
pretrained_data_encoder = utils.load_obj(os.path.join(pretrained_model_dir, 'data_encoder.pkl'))
pretrained_label_encoder = utils.load_obj(os.path.join(pretrained_model_dir, 'label_encoder.pkl'))
pre_params.data_feats = []
pre_params.label_feats = []
for feat in pretrained_data_encoder:
    pre_params.data_feats.append(feat)
for feat in label_encoder:
    pre_params.label_feats.append(feat)

# 5.2 load data
# 5.2.1 data loader for the new model
data_loader = DataLoader(new_params,
                         data_dir,
                         data_encoder,
                         label_encoder,
                         device=device)
pretrained_data_loader = DataLoader(pre_params,
                                    data_dir,# data_dir has to be the new data
                                    pretrained_data_encoder,
                                    device=device)

# 6. Modeling
logging.info('freeze_prev: {}'.format(str(freeze_prev)))
logging.info('best_prev: {}'.format(str(best_prev)))
logging.info('pretrained model: {}'.format(pretrained_model_dir))
logging.info('new model : {}'.format(model_dir))
logging.info('new data: {}'.format(data_dir))

Loading the datasets...
creating and loading data loaders
freeze_prev: True
best_prev: True
pretrained model: ../../../src/ner/experiments/disease/st/st_ncbi
new model : ../../../src/ner/experiments/test/test_prog2col
new data: ../../../src/ner/data/bc5cdr_iobes_id


In [11]:
# 6.1 Build one LSTMCRF per column, then wrap both models as progressive-net columns
from src.booster.progNN.column import Column

# Layer dimensions used for the PNN transfer, previously repeated inline as 130/200.
# NOTE(review): presumably these must agree with the sizes configured in both
# params.json files - confirm before changing either side.
RNN_INPUT_DIM = 130
RNN_HIDDEN_DIM = 200


def _build_lstmcrf(params, data_enc, label_enc):
    """Create an LSTMCRF from `params` and its encoders, moved to `device` as float."""
    return LSTMCRF(params=params,
                   char_vocab_length=data_enc[CharEncoder.FEATURE_NAME].vocab_length,
                   num_tags=label_enc[EntityEncoder.FEATURE_NAME].num_tags,
                   pretrained_word_vecs=torch.from_numpy(data_enc[WordEncoder.FEATURE_NAME].vectors),
                   dropout=params.dropout,
                   decoder_type=params.decoder,
                   bidirectional=params.rnn_bidirectional,
                   freeze_embeddings=params.freeze_wordembeddings).to(device).float()


def _build_column(model, label_enc, loader):
    """Wrap `model` in a Column, listing the layers used for PNN transfer.

    The layers must be named in order from input to output; each tuple holds
    the input and output dimension of that layer respectively.
    """
    transfer_layers = {'rnn_1': (RNN_INPUT_DIM, RNN_HIDDEN_DIM),
                       'rnn_2': (RNN_HIDDEN_DIM, RNN_HIDDEN_DIM),
                       'fc':  (RNN_HIDDEN_DIM, label_enc[EntityEncoder.FEATURE_NAME].num_tags)}
    return Column(model=model,
                  layers=transfer_layers,
                  data_loader=loader).to(device)


# 6.1.1 Define the model architecture for 1st column
logging.info('loading previous models and creating new model')
c1_model = _build_lstmcrf(pre_params, pretrained_data_encoder, pretrained_label_encoder)
# 6.1.1.1. load the pre-trained weights; otherwise the 1st column stays randomly initialized
if best_prev:
    utils.load_checkpoint(os.path.join(pretrained_model_dir, 'best.pth'), c1_model)

# 6.1.2 Define the model for 2nd column
c2_model = _build_lstmcrf(new_params, data_encoder, label_encoder)

# optionally start the target column from its own best checkpoint
if best_target:
    utils.load_checkpoint(os.path.join(target_model_dir, 'best.pth'), c2_model)

# 7. Convert to columns
logging.info('creating columns')
column_1 = _build_column(c1_model, pretrained_label_encoder, pretrained_data_loader)
column_2 = _build_column(c2_model, label_encoder, data_loader)

loading previous models and creating new model
  "num_layers={}".format(dropout, num_layers))
creating columns


In [12]:
# 8. create progressive net
from src.booster.progNN.adapter import Adapter
from src.booster.progNN.prognet import ProgressiveNet
from src.ner.evaluation import accuracy_score, f1_score, precision_score, recall_score

# 8.1 adapter connecting the previous column to the target column
adapter = Adapter(prev_columns=[column_1],
                  target_column=column_2,
                  linear=linear_adapter).to(device)

# 8.2 the progressive network itself
logging.info('creating progressive net')
progNet = ProgressiveNet(prev_columns=[column_1],
                         target_column=column_2,
                         adapter=adapter,
                         linear_adapter=linear_adapter,
                         freeze_prev=freeze_prev).to(device).float()

# 8.3 parameter counts; the trainable subset is collected once and reused by the optimizer
trainable_parameters = [p for p in progNet.parameters() if p.requires_grad]
progNet_total_params = sum(p.numel() for p in progNet.parameters())
progNet_total_trainable_params = sum(p.numel() for p in trainable_parameters)
logging.info('total params: {}'.format(str(progNet_total_params)))
logging.info('total trainable params: {}'.format(str(progNet_total_trainable_params)))

# 8.4 define the metrics
metrics = dict(accuracy=accuracy_score,
               f1_score=f1_score,  # micro F1 score
               precision_score=precision_score,
               recall_score=recall_score)

# 8.5 define the optimizer over the trainable parameters only
optimizer = optim.SGD(params=trainable_parameters,
                      lr=0.015,
                      momentum=0.9)

linear adapter: True
creating progressive net
total params: 80911282
total trainable params: 476880


In [13]:
# 9. Train ProgNet
from src.booster.progressive_ner import train_and_evaluate
logging.info('Training progressive net')
train_and_evaluate(progNet,
                   progNet.target_column.data_loader,
                   progNet.target_column.data['train'],
                   progNet.target_column.data['val'],
                   progNet.target_column.data['test'],
                   optimizer,
                   metrics,
                   new_params,
                   model_dir,
                   progNet.target_column.data_loader.data_encoder,
                   progNet.target_column.data_loader.label_encoder,
                   restore_file=None,
                   save_model=False,
                   eval=True)

Training progressive net
Epoch 1/2
Learning Rate : [0.015]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:40<00:00, 11.33it/s, loss=69.278]
- Train metrics: accuracy: 0.759 ; f1_score: 0.305 ; precision_score: 0.539 ; recall_score: 0.213 ; loss: 193.188
- val metrics : accuracy: 0.931 ; f1_score: 0.637 ; precision_score: 0.573 ; recall_score: 0.718 ; loss: 38.027
- test metrics : accuracy: 0.934 ; f1_score: 0.629 ; precision_score: 0.564 ; recall_score: 0.712 ; loss: 37.684
- Found new best F1 score
Epoch 2/2
Learning Rate : [0.014285714285714284]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:41<00:00, 11.08it/s, loss=49.868]
- Train metrics: accuracy: 0.928 ; f1_score: 0.664 ; precision_score: 0.635 ; recall_score: 0.695 ; loss: 51.102
- val metrics : accuracy: 0.933 ; f1_score: 0.694 ; precision_score: 0.700 ; recall_score: 0.688 ; loss: 34.859
- test metrics : accuracy: 0.931 ; f1_score: 0.676 ; precision

1