Train a network for Named Entity Recognition (NER) with the architecture described in [] (TODO: the citation is missing; fill in the reference).

How to run:
1. Specify the data_dir. The directory should contain the train, val, and test folders, along with the 'feats' folder obtained through feat.ipynb.
2. Specify the model directory (model_dir). The directory must be created manually and should contain a params.json file.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the project source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys

# Make the repository root importable so the `src.*` packages used below
# resolve. The path is relative to the notebook's working directory.
root_path = '../../../'
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

In [23]:
import argparse
import logging
import os
import numpy as np
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from tqdm import trange
from src.ner import utils
from src.ner.model.data_loader import DataLoader
from src.ner.evaluate import evaluate
from src.booster.progressive_encoder import CharEncoder, EntityEncoder, WordEncoder
from src.booster.progNN.net import LSTMCRF

In [10]:
# Dataset directory (step 1 of "How to run"), resolved against the repo root.
data_dir = os.path.join(
    root_path,
    'src/ner/data/bc5cdr_iobes_id',
)
# Experiment directory (step 2 of "How to run"); params.json is read from
# here and train.log is written here by the cells below.
model_dir = os.path.join(
    root_path,
    'src/ner/experiments/test/bc5cdr_testjupyter',
)

In [11]:
# 1. Pick the device to train on: the GPU when CUDA is available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
# 2. Read the hyper-parameters from <model_dir>/params.json
network_params = os.path.join(model_dir, 'params.json')
# Stop here with a clear message when the configuration file is missing.
assert os.path.isfile(network_params), \
    "No json configuration file found at {}".format(network_params)
params = utils.Params(network_params)
# Record on the params object whether a GPU can be used.
params.cuda = torch.cuda.is_available()

In [13]:
# 3. Seed the random number generators for reproducible experiments
#    (NumPy is seeded with 0, PyTorch with 230)
np.random.seed(0)
torch.manual_seed(230)
if params.cuda:
    # Also seed the CUDA generator when training on GPU
    torch.cuda.manual_seed(230)

In [14]:
# 4. Configure logging through the project helper, handing it the path
#    <model_dir>/train.log
train_log_path = os.path.join(model_dir, 'train.log')
utils.set_logger(train_log_path)

In [18]:
# 5. Create the input data pipeline
logging.info("Loading the datasets...")

from collections import OrderedDict

# 5.1 build the encoders; every encoder is pointed at the same 'feats'
#     directory inside data_dir
feats_dir = os.path.join(data_dir, 'feats')
data_encoder = OrderedDict([
    (CharEncoder.FEATURE_NAME, CharEncoder(feats_dir)),
    (WordEncoder.FEATURE_NAME, WordEncoder(feats_dir, dim=params.embedding_dim)),
])
label_encoder = OrderedDict([
    (EntityEncoder.FEATURE_NAME, EntityEncoder(feats_dir)),
])

# 5.2 load the three splits
data_loader = DataLoader(params, data_dir, data_encoder, label_encoder)
data = data_loader.load_data(['train', 'val', 'test'], data_dir)
train_data = data['train']
val_data = data['val']
test_data = data['test']

# 5.3 record the size of the train, val and test splits on params
params.train_size = train_data['size']
params.val_size = val_data['size']
params.test_size = test_data['size']

logging.info("- done.")
logging.info('train size: {}'.format(params.train_size))
logging.info('val size: {}'.format(params.val_size))
logging.info('test size: {}'.format(params.test_size))

Loading the datasets...
- done.
train size: 4559
val size: 4580
test size: 4796


In [25]:
# 6. Define the model and the optimizer
# 6.1 build the model from the encoders created in step 5
char_encoder = data_encoder[CharEncoder.FEATURE_NAME]
word_encoder = data_encoder[WordEncoder.FEATURE_NAME]
entity_encoder = label_encoder[EntityEncoder.FEATURE_NAME]

model = LSTMCRF(
    params=params,
    char_vocab_length=char_encoder.vocab_length,
    num_tags=entity_encoder.num_tags,
    pretrained_word_vecs=torch.from_numpy(word_encoder.vectors),
    dropout=params.dropout,
    decoder_type=params.decoder,
    bidirectional=params.rnn_bidirectional,
    freeze_embeddings=params.freeze_wordembeddings,
)
# Move the model to the selected device and cast it to float32
model = model.to(device).float()

# Report the parameter counts; "trainable" counts only tensors that require grad
model_total_params = sum(p.numel() for p in model.parameters())
model_total_trainable_params = sum(p.numel()
                                   for p in model.parameters()
                                   if p.requires_grad)
print('total params: ', model_total_params)
print('total trainable params: ', model_total_trainable_params)

# SGD with momentum over the trainable parameters only
optimizer = optim.SGD(params=[p for p in model.parameters() if p.requires_grad],
                      lr=params.learning_rate,
                      momentum=params.momentum)

# 6.2 fetch the metric functions, keyed by the name they are reported under
from src.ner.evaluation import accuracy_score, f1_score, precision_score, recall_score

metrics = {
    'accuracy': accuracy_score,
    'f1_score': f1_score,  # micro F1 score
    'precision_score': precision_score,
    'recall_score': recall_score,
}

  "num_layers={}".format(dropout, num_layers))


total params:  40435014
total trainable params:  434670


In [28]:
# 7. Train the model
#
# Notes on the last three keyword arguments:
#   restore_file: specify the model path to finetune
#   save_model:   Boolean, to save model at the end of the epoch or not.
#                 Model is saved for the best F1 score on validation set
#   eval:         whether to run the evaluation on the validation set or not.
from src.ner.train_new import train_and_evaluate

logging.info("Starting training for {} epoch(s)".format(params.num_epochs))

train_kwargs = dict(
    model=model,
    data_loader=data_loader,
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
    optimizer=optimizer,
    metrics=metrics,
    params=params,
    model_dir=model_dir,
    data_encoder=data_encoder,
    label_encoder=label_encoder,
    restore_file=None,
    save_model=True,
    eval=True,
)
end_epoch = train_and_evaluate(**train_kwargs)

Starting training for 2 epoch(s)
Epoch 1/2
Learning Rate : [0.015]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:36<00:00, 13.02it/s, loss=36.860]
- Train metrics: accuracy: 0.934 ; f1_score: 0.705 ; precision_score: 0.696 ; recall_score: 0.714 ; loss: 39.655
- Val Eval metrics : accuracy: 0.947 ; f1_score: 0.740 ; precision_score: 0.671 ; recall_score: 0.825 ; loss: 25.250
- Test Eval metrics : accuracy: 0.946 ; f1_score: 0.720 ; precision_score: 0.651 ; recall_score: 0.805 ; loss: 25.685
- Found new best F1 score
Epoch 2/2
Learning Rate : [0.014285714285714284]
100%|███████████████████████████████████████████████████████████████████| 456/456 [00:36<00:00, 12.80it/s, loss=30.773]
- Train metrics: accuracy: 0.964 ; f1_score: 0.839 ; precision_score: 0.817 ; recall_score: 0.862 ; loss: 31.079
- Val Eval metrics : accuracy: 0.952 ; f1_score: 0.771 ; precision_score: 0.706 ; recall_score: 0.848 ; loss: 22.590
- Test Eval metrics : accuracy: 0.951 ; 