In [4]:
import math
import os

import nemo
from nemo.utils.lr_policies import WarmupAnnealing

import nemo_nlp
from nemo_nlp import NemoBertTokenizer
from nemo_nlp.utils.callbacks.token_classification import \
    eval_iter_callback, eval_epochs_done_callback

# ---- Data ----
# To download and preprocess the data run
# python NeMo/scripts/get_tatoeba_data.py --data_dir DATA_DIR
DATA_DIR = "PATH TO WHERE THE DATA IS"
MAX_SEQ_LENGTH = 128

# It's important to specify the none_label correctly for the task at hand.
# For the combined punctuation and capitalization task use 'OL';
# for a punctuation-only model the default 'O' will work.
NONE_LABEL = 'OL'

# ---- Model ----
PRETRAINED_BERT_MODEL = "bert-base-uncased"
CLASSIFICATION_DROPOUT = 0.1

# ---- Optimization ----
BATCH_SIZE = 32
BATCHES_PER_STEP = 1
NUM_EPOCHS = 3
OPTIMIZER = "adam"
LEARNING_RATE = 5e-5
LR_WARMUP_PROPORTION = 0.1

In [6]:
# Build the neural module factory on the PyTorch backend.
# The values below configure a single-GPU, full-precision run.
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    # Multi-GPU training: take this value from the launcher, e.g. via
    # argparse (see examples/nlp/token_classification.py for an example).
    local_rank=None,
    # Mixed-precision training: use "O1" or "O2" here instead of "O0".
    # Opt levels are described at https://nvidia.github.io/apex/amp.html#opt-levels
    optimization_level="O0",
    # Multi-GPU training: use nemo.core.DeviceType.AllGpu instead.
    placement=nemo.core.DeviceType.GPU,
)

In [7]:
# Standard pretrained BERT: the tokenizer and the encoder are both created
# from the same model name. For the full list of available names, see
# nemo_nlp.huggingface.BERT.list_pretrained_models()
tokenizer = NemoBertTokenizer(
    pretrained_model=PRETRAINED_BERT_MODEL)
bert_model = nemo_nlp.huggingface.BERT(
    pretrained_model_name=PRETRAINED_BERT_MODEL)

2019-12-04 16:45:09,962 - INFO - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ebakhturina/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2019-12-04 16:45:09,962 - INFO - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ebakhturina/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2019-12-04 16:45:09,962 - INFO - loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ebakhturina/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2019-12-04 16:45:10,427 - INFO - l

In [None]:
# Describe training DAG
train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE)

# The label map comes from the training dataset; its size is the number of
# output classes for both the classifier head and the loss.
label_ids = train_data_layer.dataset.label_ids
num_classes = len(label_ids)

hidden_size = bert_model.local_parameters["hidden_size"]
ner_classifier = nemo_nlp.TokenClassifier(
    hidden_size=hidden_size,
    num_classes=num_classes,
    # Use the config-cell constant rather than a duplicated literal.
    dropout=CLASSIFICATION_DROPOUT)

# NOTE: this previously used len(tag_ids), a name that is never defined in
# this notebook (NameError on a fresh kernel). The label map is `label_ids`.
ner_loss = nemo_nlp.TokenClassificationLoss(
    d_model=bert_model.bert.config.hidden_size,
    num_classes=num_classes,
    dropout=CLASSIFICATION_DROPOUT)

# The data layer yields six tensors; the fifth (named eval_subtokens_mask in
# the evaluation DAG) is not used for training and is discarded here.
input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()

hidden_states = bert_model(
    input_ids=input_ids,
    token_type_ids=input_type_ids,
    attention_mask=input_mask)

logits = ner_classifier(hidden_states=hidden_states)
loss = ner_loss(logits=logits, labels=labels, loss_mask=loss_mask)

In [None]:
# Describe evaluation DAG: the dev split is read with the same tokenizer,
# sequence length and batch size as the training data, and is passed through
# the same BERT encoder and classifier instances.
eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_dev.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE)

(eval_input_ids,
 eval_input_type_ids,
 eval_input_mask,
 eval_loss_mask,
 eval_subtokens_mask,
 eval_labels) = eval_data_layer()

# Note: this rebinds `hidden_states` from the training DAG cell.
hidden_states = bert_model(input_ids=eval_input_ids,
                           token_type_ids=eval_input_type_ids,
                           attention_mask=eval_input_mask)

eval_logits = ner_classifier(hidden_states=hidden_states)

In [None]:
# Training callback: prints the current loss value with three decimals.
callback_train = nemo.core.SimpleLossLoggerCallback(
    tensors=[loss],
    print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())))

train_data_size = len(train_data_layer)

# If you're training on multiple GPUs, this should be
# train_data_size / (batch_size * batches_per_step * num_gpus)
steps_per_epoch = int(train_data_size / (BATCHES_PER_STEP * BATCH_SIZE))

# Evaluation callback. eval_step is set to steps_per_epoch, which presumably
# triggers evaluation once per epoch -- confirm against EvaluatorCallback.
# NOTE: the epochs-done callback previously received `tag_ids`, a name that
# is never defined in this notebook (NameError when the callback fires).
# The label map built in the training DAG cell is `label_ids`.
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_labels, eval_subtokens_mask],
    user_iter_callback=lambda x, y: eval_iter_callback(x, y),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, label_ids),
    eval_step=steps_per_epoch)

In [None]:
# Learning-rate policy sized to the whole run: NUM_EPOCHS * steps_per_epoch
# total steps, with LR_WARMUP_PROPORTION passed as the warmup ratio.
total_steps = NUM_EPOCHS * steps_per_epoch
lr_policy = WarmupAnnealing(total_steps, warmup_ratio=LR_WARMUP_PROPORTION)

# The factory supplies the trainer object; train() then optimizes the loss
# tensor and invokes the logging and evaluation callbacks.
optimizer = neural_factory.get_trainer()
optimizer.train(
    tensors_to_optimize=[loss],
    callbacks=[callback_train, callback_eval],
    lr_policy=lr_policy,
    batches_per_step=BATCHES_PER_STEP,
    optimizer=OPTIMIZER,
    optimization_params=dict(num_epochs=NUM_EPOCHS, lr=LEARNING_RATE))