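Fine-tune a pre-trained LSTM-CRF network for Named Entity Recognition on a new dataset (transfer learning). The architecture is the one described in [] (TODO: add the missing citation).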

How to run:
1. Specify the data_dir. The directory should contain the train, val, and test folders, along with the 'feats' folder obtained through feat.ipynb.
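2. Specify the model directory (`model_dir`). The directory needs to be created manually. It should contain a params.json with the hyper-parameters for the new run (e.g. the learning rate); the training log (train.log) is written there.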
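3. Specify the pretrained model directory (`pretrained_model_dir`). It should contain the checkpoint file best.pth, together with the params.json, data_encoder.pkl, and label_encoder.pkl of the pre-trained model.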
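4. Specify the 'all_layer' parameter. True corresponds to all-layer fine-tuning (every parameter stays trainable). False freezes all pre-trained parameters before the output layers are reset, so only the layers re-created by `model.reset_layers` are trained.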


In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys

# Directory that contains the `src` package, relative to the notebook's working
# directory. It is also the base for every data/model path built below.
root_path = '../../../'

# Make `src.*` importable. The membership check keeps the cell idempotent:
# re-running it must not append the same entry to sys.path again.
if root_path not in sys.path:
    sys.path.append(root_path)

In [3]:
import argparse
import logging
import os
from collections import OrderedDict

import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from tqdm import trange

from src.ner import utils
from src.ner.model.data_loader import DataLoader
from src.ner.evaluate import evaluate
from src.ner.evaluation import accuracy_score, f1_score, precision_score, recall_score
from src.booster.progressive_encoder import CharEncoder, EntityEncoder, WordEncoder
from src.booster.progNN.net import LSTMCRF
from src.booster.algorithms.fine_tune import train_and_evaluate

In [4]:
# Output directory of this run: must already exist and hold the new params.json;
# train.log is written here as well.
model_dir = os.path.join(root_path, 'src/ner/experiments/test/test_transfer')
# Dataset to fine-tune on: holds the train/val/test splits and the 'feats' folder.
data_dir = os.path.join(root_path, 'src/ner/data/bc5cdr_iobes_id')

In [5]:
# Fine-tuning mode. False: freeze every pre-trained parameter before the output
# layers are reset. True: leave all parameters trainable.
all_layer = False
# Directory of the pre-trained model (checkpoint, params.json and pickled encoders).
pretrained_model_dir = os.path.join(root_path, 'src/ner/experiments/disease/st/st_ncbi')

In [6]:
# 1. Pick the training device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# 2. Load the parameters from the json files
# `network_params` points at the configuration of the pre-trained model (it
# describes the architecture being restored); `new_network_params` starts as
# the path of the configuration for this run and is then replaced by the
# loaded Params object.
network_params = os.path.join(pretrained_model_dir, 'params.json')
new_network_params = os.path.join(model_dir, 'params.json')
for json_path in (network_params, new_network_params):
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)

params = utils.Params(network_params)
new_network_params = utils.Params(new_network_params)

# use GPU if available
use_cuda = torch.cuda.is_available()
params.cuda = use_cuda
new_network_params.cuda = use_cuda

In [8]:
# 3. Fix the random seeds (numpy and torch) for reproducible experiments
np.random.seed(0)
torch.manual_seed(230)
if params.cuda:
    # also seed the CUDA generator when training on the GPU
    torch.cuda.manual_seed(230)

In [9]:
# 4. Configure the logger; the log file lives in the new model directory
log_path = os.path.join(model_dir, 'train.log')
utils.set_logger(log_path)

In [10]:
# 5. Create the input data pipeline
logging.info("Loading the datasets...")

# 5.1 load the encoders
# The input encoders and the label encoder of the pre-trained model are read
# from the pre-trained model directory; the latter is only used later to size
# the pre-trained network before its checkpoint is restored.
# NOTE(review): utils.load_obj reads .pkl files, presumably via pickle, so
# only point pretrained_model_dir at directories you trust.
data_encoder = utils.load_obj(os.path.join(pretrained_model_dir, 'data_encoder.pkl'))
pretrained_label_encoder = utils.load_obj(os.path.join(pretrained_model_dir, 'label_encoder.pkl'))
# The label encoder for the new task is built from the new dataset's 'feats' folder.
label_encoder = OrderedDict()
label_encoder[EntityEncoder.FEATURE_NAME] = EntityEncoder(os.path.join(data_dir, 'feats'))

# 5.2 load data
data_loader = DataLoader(params, data_dir, data_encoder, label_encoder)
data = data_loader.load_data(['train', 'val', 'test'], data_dir)
train_data = data['train']
val_data = data['val']
test_data = data['test']

# 5.3 specify the train, val, and test dataset sizes
new_network_params.train_size = train_data['size']
new_network_params.val_size = val_data['size']
new_network_params.test_size = test_data['size']
logging.info("- done.")
logging.info('train size: {}'.format(new_network_params.train_size))
logging.info('val size: {}'.format(new_network_params.val_size))
logging.info('test size: {}'.format(new_network_params.test_size))

Loading the datasets...
- done.
train size: 4559
val size: 4580
test size: 4796


In [11]:
# 6. Modeling
logging.info('pretrained model: {}'.format(pretrained_model_dir))
logging.info('all layer: {}'.format(all_layer))
logging.info('new model : {}'.format(model_dir))
logging.info('new data: {}'.format(data_dir))

# 6.1 Define the model
# The network is first built with the configuration and the tag count of the
# pre-trained model, so that its checkpoint can be loaded into it.
model = LSTMCRF(params=params,
                char_vocab_length=data_encoder[CharEncoder.FEATURE_NAME].vocab_length,
                num_tags=pretrained_label_encoder[EntityEncoder.FEATURE_NAME].num_tags,
                pretrained_word_vecs=torch.from_numpy(data_encoder[WordEncoder.FEATURE_NAME].vectors),
                dropout=params.dropout,
                decoder_type=params.decoder,
                bidirectional=params.rnn_bidirectional,
                freeze_embeddings=params.freeze_wordembeddings).to(device).float()
model_total_params = sum(p.numel() for p in model.parameters())
print('total params: ', model_total_params)

# 6.2 Load the pre-trained weights
logging.info('loading pretrained model')
utils.load_checkpoint(os.path.join(pretrained_model_dir, 'best.pth'), model)

# 6.3 Adapt the model to the new label set
if not all_layer:
    # Freeze everything that was pre-trained. This must happen before
    # reset_layers so that the layers it re-creates are not frozen as well.
    for pretrained_param in model.parameters():
        pretrained_param.requires_grad = False
# NOTE(review): reset_layers presumably re-initialises the tag-dependent output
# layers for the new number of tags; confirm against LSTMCRF.
model.reset_layers(label_encoder[EntityEncoder.FEATURE_NAME].num_tags)
# Move the model again so that any layers created by the reset end up on
# `device` as float tensors too.
model.to(device).float()

# Collect the trainable parameters once; they are used both for the report
# below and for the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
model_total_trainable_params = sum(p.numel() for p in trainable_params)
print('total trainable params: ', model_total_trainable_params)

logging.info('loaded pretrained model')

# 6.4 Optimizer
# The learning rate comes from the configuration of this run, the momentum from
# the pre-trained model's configuration.
optimizer = optim.SGD(params=trainable_params,
                      lr=new_network_params.learning_rate,
                      momentum=params.momentum)

# 6.5 Metrics to report during training
metrics = {'accuracy': accuracy_score,
           'f1_score': f1_score,  # micro F1 score
           'precision_score': precision_score,
           'recall_score': recall_score}

pretrained model: ../../../src/ner/experiments/disease/st/st_ncbi
all layer: False
new model : ../../../src/ner/experiments/test/test_transfer
new data: ../../../src/ner/data/bc5cdr_iobes_id
  "num_layers={}".format(dropout, num_layers))


total params:  40434058


loading pretrained model


total trainable params:  2130


loaded pretrained model


In [12]:
# 7. Train the model
#
# The evaluation is done on the train, val, and test sets, but model selection
# is based on the val set. This avoids the extra overhead of running the
# evaluate.py script.
#
# Arguments of train_and_evaluate worth noting:
#   restore_file: path of a model to fine-tune from (None: use `model` as is)
#   save_model:   Boolean, whether to save the model at the end of an epoch;
#                 the model is saved for the best F1 score on the validation set
#   eval:         whether to run the evaluation on the validation set or not
#
# Training is driven by `new_network_params`, so the number of epochs is read
# from that configuration rather than from the pre-trained model's `params`.
logging.info("Starting training for {} epoch(s)".format(new_network_params.num_epochs))
end_epoch = train_and_evaluate(model=model,
                               data_loader=data_loader,
                               train_data=train_data,
                               val_data=val_data,
                               test_data=test_data,
                               optimizer=optimizer,
                               metrics=metrics,
                               params=new_network_params,
                               model_dir=model_dir,
                               data_encoder=data_encoder,
                               label_encoder=label_encoder,
                               restore_file=None,
                               save_model=False,
                               eval=True)

Starting training for 200 epoch(s)
Epoch 1/2
Learning Rate : [0.015]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:25<00:00, 17.62it/s, loss=67.701]
- Train metrics: accuracy: 0.693 ; f1_score: 0.218 ; precision_score: 0.417 ; recall_score: 0.147 ; loss: 198.494
- Val Eval metrics : accuracy: 0.915 ; f1_score: 0.472 ; precision_score: 0.359 ; recall_score: 0.690 ; loss: 44.932
- Test Eval metrics : accuracy: 0.915 ; f1_score: 0.455 ; precision_score: 0.348 ; recall_score: 0.659 ; loss: 45.215
- Found new best F1 score
Epoch 2/2
Learning Rate : [0.014285714285714284]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:26<00:00, 17.47it/s, loss=54.570]
- Train metrics: accuracy: 0.892 ; f1_score: 0.505 ; precision_score: 0.435 ; recall_score: 0.602 ; loss: 62.567
- Val Eval metrics : accuracy: 0.915 ; f1_score: 0.462 ; precision_score: 0.349 ; recall_score: 0.681 ; loss: 43.462
- Test Eval metrics : accuracy: 0.917