## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import random
import os
import re
import string
import argparse
import datetime
import itertools
import sys
import math
from math import ceil, floor
from random import randint
from tqdm import tqdm
tqdm.pandas()
import time
import copy
import gc
from IPython import display as ipd

# HuggingFace
from datasets import load_dataset

# PyTorch 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# For colored terminal text
from colorama import Fore, Back, Style
c_  = Fore.GREEN
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

## Utils

In [2]:
def info(t, name=''):
    """Print a one-line summary of `t`: the label, its type string and its shape.

    Args:
        t: object exposing a `type()` method and a `shape` attribute (e.g. a torch tensor).
        name: optional label printed in front of the summary.
    """
    type_str = t.type()
    shape = t.shape
    print(name, '|', type_str, '|', shape)


def flatten(list_in):
    """Remove one level of nesting inside every element of `list_in`.

    Each element of `list_in` is an iterable of iterables; it is replaced by a
    single flat list holding the concatenation of its inner iterables.
    """
    flattened = []
    for nested in list_in:
        merged = []
        for inner in nested:
            merged.extend(inner)
        flattened.append(merged)
    return flattened


def is_number(s):
    """Return True if `s` can be interpreted as a number, False otherwise.

    Two checks are tried in turn:
      1. `float(s)` succeeds;
      2. `unicodedata.numeric(s)` succeeds (single numeric characters such as a
         vulgar fraction).

    Inputs that neither check can handle (including None or other non-string
    objects) yield False instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # TypeError covers non-string / non-numeric objects such as None.
        pass
    else:
        return True
    try:
        import unicodedata
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True


def get_words_num(word_sequences):
    """Return the total number of items over all sequences in `word_sequences`."""
    total = 0
    for word_seq in word_sequences:
        total += len(word_seq)
    return total


def get_datetime_str():
    """Return the current local time formatted as 'YYYY_MM_DD_HH-MM_SS'."""
    now = datetime.datetime.now()
    date_part = '_'.join('%02d' % value for value in (now.year, now.month, now.day))
    time_part = '%02d-%02d_%02d' % (now.hour, now.minute, now.second)
    return date_part + '_' + time_part


def get_sequences_by_indices(sequences, indices):
    """Return the elements of `sequences` picked in the order given by `indices`."""
    selected = []
    for idx in indices:
        selected.append(sequences[idx])
    return selected


def argsort(seq):
    """Return the indices that sort `seq` in ascending order (stable for ties)."""
    positions = list(range(len(seq)))
    positions.sort(key=lambda pos: seq[pos])
    return positions


def argsort_sequences_by_lens(list_in):
    """Compute the permutation that orders `list_in` from longest to shortest item.

    Returns:
        (sort_indices, reverse_sort_indices): `sort_indices[r]` is the original
        position of the item ranked r-th by decreasing length, and
        `reverse_sort_indices[p]` is the rank of the item originally at position p.
    """
    # Sorting negated lengths ascending gives a descending-length order.
    negated_lens = [-len(item) for item in list_in]
    sort_indices = argsort(negated_lens)
    reverse_sort_indices = [-1] * len(list_in)
    for rank, original_pos in enumerate(sort_indices):
        reverse_sort_indices[original_pos] = rank
    return sort_indices, reverse_sort_indices


def log_sum_exp(x):
    """Compute log(sum(exp(x))) over the last dimension of `x`.

    The maximum along the last dimension is subtracted before exponentiating and
    added back afterwards, which keeps the exponentials from overflowing.
    """
    max_score = torch.max(x, -1)[0]
    shifted = x - max_score.unsqueeze(-1).expand_as(x)
    return max_score + shifted.exp().sum(-1).log()


def str2bool(v):
    """Convert a textual flag to a bool (case-insensitive).

    Accepts 'yes', 'true', 't', 'y', '1' as True and 'no', 'false', 'f', 'n', '0'
    as False.

    Raises:
        argparse.ArgumentTypeError: if `v` is none of the recognised spellings.
    """
    normalized = v.lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def get_input_arguments():
    """Return a 'python3 main.py ...' command line rebuilt from the current sys.argv arguments."""
    cli_args = sys.argv[1:]
    return 'python3 main.py ' + ' '.join(cli_args)

## DataIO

In [3]:
class DataIONCBI():
    """
    DataIONCBI is an input/output data wrapper for NCBI dataset.

    The dataset is obtained through `load_dataset(dataset_name)`; each split can
    optionally be capped to a fixed number of samples (`train_no`, `dev_no`,
    `test_no`), which is convenient for quick experiments.
    """
    def __init__(self, dataset_name='ncbi_disease', train_no=None, dev_no=None, test_no=None):
        """
        Args:
            dataset_name: name passed to `load_dataset`.
            train_no, dev_no, test_no: optional maximum number of samples to read
                from the train / validation / test split (None or 0 = no limit).
        """
        self.NCBIDataset = load_dataset(dataset_name)
        self.train_no = train_no
        self.dev_no = dev_no
        self.test_no = test_no

    def read_train_dev_test(self, args):
        """Read the 'train', 'validation' and 'test' splits.

        Args:
            args: object with a `verbose` attribute controlling the loading message.

        Returns:
            Tuple (words_train, tags_train, words_dev, tags_dev, words_test, tags_test).
        """
        word_sequences_train, tag_sequences_train = self.read_data('train', verbose=args.verbose, exp_no=self.train_no)
        word_sequences_dev, tag_sequences_dev = self.read_data('validation', verbose=args.verbose, exp_no=self.dev_no)
        word_sequences_test, tag_sequences_test = self.read_data('test', verbose=args.verbose, exp_no=self.test_no)
        return word_sequences_train, tag_sequences_train, word_sequences_dev, tag_sequences_dev, word_sequences_test, tag_sequences_test

    def read_data(self, mode, verbose=True, exp_no=None):
        """Read one split and return its token and tag sequences.

        Rows whose 'tokens' or 'ner_tags' field is empty are skipped.

        Args:
            mode: split key looked up in the loaded dataset (e.g. 'train').
            verbose: print the number of loaded samples and words.
            exp_no: maximum number of (non-empty) samples to return; None or 0
                means no limit.

        Returns:
            (word_sequences, tag_sequences): two parallel lists.
        """
        dataset = self.NCBIDataset[mode]
        word_sequences = list()
        tag_sequences = list()
        for row in dataset:
            # Cap on the number of samples actually collected, so that skipped
            # empty rows do not eat into the requested budget.
            if exp_no and len(word_sequences) >= exp_no:
                break
            if len(row['tokens']) == 0 or len(row['ner_tags']) == 0:
                continue
            word_sequences.append(row['tokens'])
            tag_sequences.append(row['ner_tags'])

        if verbose:
            print('Loading from %s: %d samples, %d words.' % (mode, len(word_sequences), get_words_num(word_sequences)))
        return word_sequences, tag_sequences

## Dataset

In [4]:
class DatasetsBank():
    """DatasetsBank provides storing the train/dev/test data subsets and sampling batches from the train dataset."""
    def __init__(self, verbose=True):
        self.verbose = verbose
        # Unique words over all added subsets, in order of first appearance.
        self.unique_words_list = list()

    def __add_to_unique_words_list(self, word_sequences):
        """Append every not-yet-seen word of `word_sequences` to `unique_words_list`."""
        # A set mirror of the list keeps the membership test O(1).
        seen_words = set(self.unique_words_list)
        for word_seq in word_sequences:
            for word in word_seq:
                if word not in seen_words:
                    seen_words.add(word)
                    self.unique_words_list.append(word)
        if self.verbose:
            print('DatasetsBank: len(unique_words_list) = %d unique words.' % (len(self.unique_words_list)))

    def add_train_sequences(self, word_sequences_train, tag_sequences_train):
        """Store the training subset and register its words."""
        self.train_data_num = len(word_sequences_train)
        self.word_sequences_train = word_sequences_train
        self.tag_sequences_train = tag_sequences_train
        self.__add_to_unique_words_list(word_sequences_train)

    def add_dev_sequences(self, word_sequences_dev, tag_sequences_dev):
        """Store the development subset and register its words."""
        self.word_sequences_dev = word_sequences_dev
        self.tag_sequences_dev = tag_sequences_dev
        self.__add_to_unique_words_list(word_sequences_dev)

    def add_test_sequences(self, word_sequences_test, tag_sequences_test):
        """Store the test subset and register its words."""
        self.word_sequences_test = word_sequences_test
        self.tag_sequences_test = tag_sequences_test
        self.__add_to_unique_words_list(word_sequences_test)

    def __get_train_batch(self, batch_indices):
        """Return the (word, tag) training sequences at positions `batch_indices`."""
        word_sequences_train_batch = [self.word_sequences_train[i] for i in batch_indices]
        tag_sequences_train_batch = [self.tag_sequences_train[i] for i in batch_indices]
        return word_sequences_train_batch, tag_sequences_train_batch

    def get_train_batches(self, batch_size):
        """Yield disjoint random training batches of exactly `batch_size` samples.

        The training set is shuffled once per call; the trailing samples that do
        not fill a complete batch are dropped.

        Yields:
            (word_sequences_batch, tag_sequences_batch) tuples.
        """
        random_indices = np.random.permutation(np.arange(self.train_data_num))
        for k in range(self.train_data_num // batch_size):  # the last, incomplete batch is dropped
            # Batch k owns the k-th consecutive window of the permutation, so
            # batches never overlap.
            start = k * batch_size
            batch_indices = random_indices[start:start + batch_size].tolist()
            yield self.__get_train_batch(batch_indices)


class DatasetsBankSorted():
    """DatasetsBankSorted stores the train/dev/test subsets with the training data sorted by
    decreasing sequence length, and samples training batches made of neighbouring
    (similar-length) sequences."""
    def __init__(self, verbose=True):
        self.verbose = verbose
        # Unique words over all added subsets, in order of first appearance.
        self.unique_words_list = list()

    def __add_to_unique_words_list(self, word_sequences):
        """Append every not-yet-seen word of `word_sequences` to `unique_words_list`."""
        # A set mirror of the list keeps the membership test O(1).
        seen_words = set(self.unique_words_list)
        for word_seq in word_sequences:
            for word in word_seq:
                if word not in seen_words:
                    seen_words.add(word)
                    self.unique_words_list.append(word)
        if self.verbose:
            print('DatasetsBank: len(unique_words_list) = %d unique words.' % (len(self.unique_words_list)))

    def add_train_sequences(self, word_sequences_train, tag_sequences_train):
        """Store the training subset sorted from longest to shortest sequence and register its words."""
        sort_indices, _ = argsort_sequences_by_lens(word_sequences_train)
        self.word_sequences_train = get_sequences_by_indices(word_sequences_train, sort_indices)
        self.tag_sequences_train = get_sequences_by_indices(tag_sequences_train, sort_indices)
        self.train_data_num = len(word_sequences_train)
        self.__add_to_unique_words_list(word_sequences_train)

    def add_dev_sequences(self, word_sequences_dev, tag_sequences_dev):
        """Store the development subset (unsorted) and register its words."""
        self.word_sequences_dev = word_sequences_dev
        self.tag_sequences_dev = tag_sequences_dev
        self.__add_to_unique_words_list(word_sequences_dev)

    def add_test_sequences(self, word_sequences_test, tag_sequences_test):
        """Store the test subset (unsorted) and register its words."""
        self.word_sequences_test = word_sequences_test
        self.tag_sequences_test = tag_sequences_test
        self.__add_to_unique_words_list(word_sequences_test)

    def __get_train_batch(self, batch_size, batch_no, rand_seed=0):
        """Return the `batch_no`-th window of `batch_size` sorted samples, shifted by `rand_seed`."""
        i = batch_no * batch_size + rand_seed
        j = min((batch_no + 1) * batch_size, self.train_data_num + 1) + rand_seed
        return self.word_sequences_train[i:j], self.tag_sequences_train[i:j]

    def get_train_batches(self, batch_size):
        """Yield batches of neighbouring sorted samples in random batch order.

        A random offset in [0, batch_size - 1] shifts all window boundaries for
        this call; one batch fewer than `train_data_num // batch_size` is
        produced so the shifted windows stay inside the data.
        """
        rand_seed = randint(0, batch_size - 1)
        batch_num = self.train_data_num // batch_size
        random_indices = np.random.permutation(np.arange(batch_num - 1)).tolist()
        for k in random_indices:
            yield self.__get_train_batch(batch_size, batch_no=k, rand_seed=rand_seed)

    def __get_train_batch_regularized(self, batch_size, rand_batch_size, batch_no):
        """Return a window of `batch_size` sorted samples plus `rand_batch_size` random training samples."""
        i = batch_no * batch_size
        j = min((batch_no + 1) * batch_size, self.train_data_num + 1)
        # Slicing copies, so the appends below do not touch the stored data.
        word_sequences_train_batch = self.word_sequences_train[i:j]
        tag_sequences_train_batch = self.tag_sequences_train[i:j]
        for _ in range(rand_batch_size):
            # random.randint includes both endpoints, so the last valid index is
            # train_data_num - 1.
            r = randint(0, self.train_data_num - 1)
            word_sequences_train_batch.append(self.word_sequences_train[r])
            tag_sequences_train_batch.append(self.tag_sequences_train[r])
        return word_sequences_train_batch, tag_sequences_train_batch

    def get_train_batches_regularized(self, batch_size):
        """Yield batches of `batch_size - 2` neighbouring sorted samples plus 2 random samples.

        NOTE(review): the number of batches is `train_data_num // batch_size`
        while the sorted windows are only `batch_size - 2` wide, so the tail of
        the sorted data is reached only through the random samples - confirm
        this is intended.
        """
        batch_num = self.train_data_num // batch_size
        random_indices = np.random.permutation(np.arange(batch_num)).tolist()
        for k in random_indices:
            yield self.__get_train_batch_regularized(batch_size-2, rand_batch_size=2, batch_no=k)

## Sequence Indexers

In [5]:
class SeqIndexerBase():
    """
    SeqIndexerBase is a base abstract class for sequence indexers. It converts list of lists of string items
    to the list of lists of integer indices and back. Items could be either words, tags or characters.

    When `load_embeddings` is True the constructor calls `add_emb_vector`,
    `generate_zero_emb_vector` and `generate_random_emb_vector`, which this class
    does not define; they must be supplied by a subclass.
    """
    def __init__(self, gpu=-1, check_for_lowercase=True, zero_digits=False, pad='<pad>', unk='<unk>',
                 load_embeddings=False, embeddings_dim=0, verbose=False):
        """
        Args:
            gpu: CUDA device index; tensors are moved to it when gpu >= 0.
            check_for_lowercase, zero_digits: matching options stored for subclasses.
            pad: padding item, registered first (index 0) unless None.
            unk: unknown item, registered right after `pad` unless None.
            load_embeddings: whether embedding vectors are kept alongside the items.
            embeddings_dim: dimensionality of the embedding vectors.
            verbose: stored flag used by subclasses for progress output.
        """
        self.gpu = gpu
        self.check_for_lowercase = check_for_lowercase
        self.zero_digits = zero_digits
        self.pad = pad
        self.unk = unk
        self.load_embeddings = load_embeddings
        self.embeddings_dim = embeddings_dim
        self.verbose = verbose
        self.out_of_vocabulary_list = list()
        self.item2idx_dict = dict()
        self.idx2item_dict = dict()
        if load_embeddings:
            self.embeddings_loaded = False
            self.embedding_vectors_list = list()
        if pad is not None:
            self.pad_idx = self.add_item(pad)
            if load_embeddings:
                self.add_emb_vector(self.generate_zero_emb_vector())
        if unk is not None:
            self.unk_idx = self.add_item(unk)
            if load_embeddings:
                self.add_emb_vector(self.generate_random_emb_vector())

    def get_items_list(self):
        """Return all registered items in insertion order."""
        return list(self.item2idx_dict)

    def get_items_count(self):
        """Return the number of registered items (including pad/unk)."""
        return len(self.item2idx_dict)

    def item_exists(self, item):
        """Return True if `item` is already registered."""
        return item in self.item2idx_dict

    def add_item(self, item):
        """Register `item` under the next free index and return that index."""
        # len() of the dict is O(1); building the key list here would make
        # vocabulary construction quadratic.
        idx = len(self.item2idx_dict)
        self.item2idx_dict[item] = idx
        self.idx2item_dict[idx] = item
        return idx

    def get_class_num(self):
        """Return the number of registered items excluding the pad and unk entries."""
        if self.pad is not None and self.unk is not None:
            return self.get_items_count() - 2
        if self.pad is not None or self.unk is not None:
            return self.get_items_count() - 1
        return self.get_items_count()

    def items2idx(self, item_sequences):
        """Map sequences of items to sequences of indices.

        Unregistered items map to the unk index, or to the pad index when no unk
        item is configured.
        """
        fallback_item = self.unk if self.unk is not None else self.pad
        idx_sequences = []
        for item_seq in item_sequences:
            idx_seq = [self.item2idx_dict[item] if item in self.item2idx_dict
                       else self.item2idx_dict[fallback_item]
                       for item in item_seq]
            idx_sequences.append(idx_seq)
        return idx_sequences

    def idx2items(self, idx_sequences):
        """Map sequences of indices back to sequences of items."""
        return [[self.idx2item_dict[idx] for idx in idx_seq] for idx_seq in idx_sequences]

    def items2tensor(self, item_sequences, align='left', word_len=-1):
        """Convert item sequences to a padded index tensor (see `idx2tensor`)."""
        idx = self.items2idx(item_sequences)
        return self.idx2tensor(idx, align, word_len)

    def idx2tensor(self, idx_sequences, align='left', word_len=-1):
        """Pack index sequences into a zero-padded LongTensor of shape (batch, word_len).

        Args:
            idx_sequences: list of lists of integer indices.
            align: 'left' puts each sequence at the start of its row, 'center'
                centres it inside the row.
            word_len: row width; -1 means the length of the longest sequence
                (0 for an empty batch). Longer sequences are truncated.

        Returns:
            torch.LongTensor, moved to the configured CUDA device when gpu >= 0.

        Raises:
            ValueError: for an unknown `align` value.
        """
        batch_size = len(idx_sequences)
        if word_len == -1:
            word_len = max((len(idx_seq) for idx_seq in idx_sequences), default=0)
        tensor = torch.zeros(batch_size, word_len, dtype=torch.long)
        for k, idx_seq in enumerate(idx_sequences):
            curr_seq_len = len(idx_seq)
            if curr_seq_len > word_len:
                idx_seq = idx_seq[:word_len]
                curr_seq_len = word_len
            row_values = torch.tensor(idx_seq, dtype=torch.long)
            if align == 'left':
                tensor[k, :curr_seq_len] = row_values
            elif align == 'center':
                start_idx = (word_len - curr_seq_len) // 2
                tensor[k, start_idx:start_idx+curr_seq_len] = row_values
            else:
                raise ValueError('Unknown align string.')
        if self.gpu >= 0:
            tensor = tensor.cuda(device=self.gpu)
        return tensor

In [6]:
class SeqIndexerBaseEmbeddings(SeqIndexerBase):
    """
    SeqIndexerBaseEmbeddings is a basic abstract sequence indexers class that implements work with embeddings.
    """
    def __init__(self, gpu, check_for_lowercase, zero_digits, pad, unk, load_embeddings, embeddings_dim, verbose):
        SeqIndexerBase.__init__(self, gpu, check_for_lowercase, zero_digits, pad, unk, load_embeddings, embeddings_dim,
                                verbose)

    @staticmethod
    def load_embeddings_from_file(emb_fn, emb_delimiter, verbose=True):
        """Lazily read a text embeddings file, yielding (word, vector) pairs.

        Each line is split on `emb_delimiter`; lines with fewer than 5 fields
        are skipped. The first field is the word, the remaining non-blank fields
        are parsed as floats.

        This is a generator: it can be iterated only once, and the file is
        closed when iteration finishes (or the generator is closed).

        Args:
            emb_fn: path of the embeddings file.
            emb_delimiter: field separator.
            verbose: print a progress line every 100000 lines.
        """
        with open(emb_fn, 'r') as emb_file:
            for k, line in enumerate(emb_file):
                values = line.split(emb_delimiter)
                if len(values) < 5:
                    continue
                word = values[0]
                emb_vector = [float(t) for t in values[1:] if t and not t.isspace()]
                if verbose and k % 100000 == 0:
                    print('Reading embeddings file %s, line = %d' % (emb_fn, k))
                yield word, emb_vector

    def generate_zero_emb_vector(self):
        """Return an all-zero vector of length `embeddings_dim`.

        Raises:
            ValueError: if `embeddings_dim` is 0.
        """
        if self.embeddings_dim == 0:
            raise ValueError('embeddings_dim is not known.')
        return [0 for _ in range(self.embeddings_dim)]

    def generate_random_emb_vector(self):
        """Return a vector drawn uniformly from [-sqrt(3/dim), sqrt(3/dim)].

        Raises:
            ValueError: if `embeddings_dim` is 0.
        """
        if self.embeddings_dim == 0:
            raise ValueError('embeddings_dim is not known.')
        bound = np.sqrt(3.0 / self.embeddings_dim)
        return np.random.uniform(-bound, bound, self.embeddings_dim).tolist()

    def add_emb_vector(self, emb_vector):
        """Append `emb_vector` to the list of stored embedding vectors."""
        self.embedding_vectors_list.append(emb_vector)

    def get_loaded_embeddings_tensor(self):
        """Return all stored embedding vectors as a single FloatTensor."""
        return torch.FloatTensor(np.asarray(self.embedding_vectors_list))

In [7]:
class SeqIndexerWord(SeqIndexerBaseEmbeddings):
    """SeqIndexerWord converts list of lists of words as strings to list of lists of integer indices and back."""
    def __init__(self, gpu=-1, check_for_lowercase=True, embeddings_dim=0, verbose=True):
        SeqIndexerBaseEmbeddings.__init__(self, gpu=gpu, check_for_lowercase=check_for_lowercase, zero_digits=True,
                                          pad='<pad>', unk='<unk>', load_embeddings=True, embeddings_dim=embeddings_dim,
                                          verbose=verbose)
        # Counters describing how dataset words were matched to embedding words.
        self.original_words_num = 0
        self.lowercase_words_num = 0
        self.zero_digits_replaced_num = 0
        self.zero_digits_replaced_lowercase_num = 0
        self.capitalize_word_num = 0
        self.uppercase_word_num = 0

    def load_items_from_embeddings_file_and_unique_words_list(self, emb_fn, emb_delimiter, emb_load_all,
                                                              unique_words_list):
        """Build the vocabulary and its embedding vectors from a pretrained embeddings file.

        Every word of `unique_words_list` that can be matched to a word of the
        embeddings file (see `get_embeddings_word`) is registered with the
        matched vector. With `emb_load_all`, every remaining word of the file is
        registered too, together with those lower/upper/capitalized variants
        that are neither registered yet nor present in the file.

        Args:
            emb_fn: path of the embeddings file.
            emb_delimiter: field separator used in that file.
            emb_load_all: also load the embeddings of words absent from the dataset.
            unique_words_list: unique words of the dataset.
        """
        def read_embeddings():
            # The loader is a single-pass generator, so every pass over the
            # file needs a fresh one (this also avoids holding all vectors in
            # memory at once).
            return SeqIndexerBaseEmbeddings.load_embeddings_from_file(emb_fn, emb_delimiter, verbose=True)

        # Full set of case-sensitive words available in the embeddings file
        # (a set, so membership tests are O(1)).
        embeddings_words_set = {emb_word for emb_word, _ in read_embeddings()}
        # Reverse mapping: word from the embeddings file -> dataset words matched to it
        emb_word_dict2unique_word_list = dict()
        out_of_vocabulary_words_list = list()
        for unique_word in unique_words_list:
            emb_word = self.get_embeddings_word(unique_word, embeddings_words_set)
            if emb_word is None:
                out_of_vocabulary_words_list.append(unique_word)
            else:
                emb_word_dict2unique_word_list.setdefault(emb_word, []).append(unique_word)
        # Add pretrained embeddings for unique_words
        for emb_word, emb_vec in read_embeddings():
            for unique_word in emb_word_dict2unique_word_list.get(emb_word, ()):
                # Guard against duplicates so item indices and vectors stay aligned.
                if not self.item_exists(unique_word):
                    self.add_word_emb_vec(unique_word, emb_vec)
        if self.verbose:
            print('\nload_vocabulary_from_embeddings_file_and_unique_words_list:')
            print('    First 50 OOV words:')
            for i, oov_word in enumerate(out_of_vocabulary_words_list[:50]):
                print('        out_of_vocabulary_words_list[%d] = %s' % (i, oov_word))
            print(' -- len(out_of_vocabulary_words_list) = %d' % len(out_of_vocabulary_words_list))
            print(' -- original_words_num = %d' % self.original_words_num)
            print(' -- lowercase_words_num = %d' % self.lowercase_words_num)
            print(' -- zero_digits_replaced_num = %d' % self.zero_digits_replaced_num)
            print(' -- zero_digits_replaced_lowercase_num = %d' % self.zero_digits_replaced_lowercase_num)
        # Load all embeddings
        if emb_load_all:
            load_all_words_num_before = self.get_items_count()
            load_all_words_lower_num = 0
            for emb_word, emb_vec in read_embeddings():
                if self.item_exists(emb_word):
                    continue
                if self._add_case_variant(emb_word.lower(), emb_vec, embeddings_words_set):
                    load_all_words_lower_num += 1
                self._add_case_variant(emb_word.upper(), emb_vec, embeddings_words_set)
                self._add_case_variant(emb_word.capitalize(), emb_vec, embeddings_words_set)
                self.add_word_emb_vec(emb_word, emb_vec)
            load_all_words_num_after = self.get_items_count()
            if self.verbose:
                print(' ++ load_all_words_num_before = %d ' % load_all_words_num_before)
                print(' ++ load_all_words_lower_num = %d ' % load_all_words_lower_num)
                print(' ++ load_all_words_num_after = %d ' % load_all_words_num_after)

    def _add_case_variant(self, variant, emb_vec, embeddings_words_set):
        """Register `variant` with `emb_vec` unless it is already known or has its own file entry.

        The check is made against the live vocabulary, so a variant is never
        registered twice. Returns True if the variant was added.
        """
        if self.item_exists(variant) or variant in embeddings_words_set:
            return False
        self.add_word_emb_vec(variant, emb_vec)
        return True

    def get_embeddings_word(self, word, embeddings_word_list):
        """Find the embeddings-file word that should represent `word`.

        Tried in order: the word itself; its lowercase form (if
        `check_for_lowercase`); the word with every digit replaced by '0' (if
        `zero_digits`); the lowercase form with digits replaced (if both). The
        matching counter is incremented.

        Args:
            word: dataset word.
            embeddings_word_list: container of embeddings-file words supporting `in`.

        Returns:
            The matched embeddings word, or None if nothing matches.
        """
        if word in embeddings_word_list:
            self.original_words_num += 1
            return word
        elif self.check_for_lowercase and word.lower() in embeddings_word_list:
            self.lowercase_words_num += 1
            return word.lower()
        elif self.zero_digits and re.sub(r'\d', '0', word) in embeddings_word_list:
            self.zero_digits_replaced_num += 1
            return re.sub(r'\d', '0', word)
        elif self.check_for_lowercase and self.zero_digits and re.sub(r'\d', '0', word.lower()) in embeddings_word_list:
            self.zero_digits_replaced_lowercase_num += 1
            return re.sub(r'\d', '0', word.lower())
        return None

    def add_word_emb_vec(self, word, emb_vec):
        """Register `word` and store `emb_vec` as its embedding vector."""
        self.add_item(word)
        self.add_emb_vector(emb_vec)

    def get_unique_characters_list(self, verbose=False, init_by_printable_characters=True):
        """Return the list of distinct characters occurring in the registered words.

        Args:
            verbose: print every word that contributes a new character.
            init_by_printable_characters: start from `string.printable` instead
                of an empty set.
        """
        if init_by_printable_characters:
            unique_characters_set = set(string.printable)
        else:
            unique_characters_set = set()
        cnt = 0
        items_list = self.get_items_list()
        for n, word in enumerate(items_list):
            len_before = len(unique_characters_set)
            unique_characters_set = unique_characters_set.union(set(word))
            if verbose and len(unique_characters_set) > len_before:
                cnt += 1
                print('n = %d/%d (%d) %s' % (n, len(items_list), cnt, word))
        return list(unique_characters_set)

In [8]:
class SeqIndexerTag(SeqIndexerBase):
    """SeqIndexerTag maps sequences of tags to sequences of integer indices and back.

    It is configured with a '<pad>' item, no unknown item and no embeddings.
    """
    def __init__(self, gpu):
        super(SeqIndexerTag, self).__init__(gpu=gpu, check_for_lowercase=False, zero_digits=False,
                                            pad='<pad>', unk=None, load_embeddings=False, verbose=True)

    def add_tag(self, tag):
        """Register `tag` unless it is already known."""
        if self.item_exists(tag):
            return
        self.add_item(tag)

    def load_items_from_tag_sequences(self, tag_sequences):
        """Register every tag occurring in `tag_sequences`, in order of first appearance."""
        assert self.load_embeddings == False
        for tag in (tag for tag_seq in tag_sequences for tag in tag_seq):
            self.add_tag(tag)
        if not self.verbose:
            return
        print('\nload_vocabulary_from_tag_sequences:')
        print(' -- class_num = %d' % self.get_class_num())
        print(' --', self.item2idx_dict)

In [9]:
class SeqIndexerBaseChar(SeqIndexerBaseEmbeddings):
    """SeqIndexerBaseChar maps sequences of characters to sequences of integer indices and back.

    It is configured with '<pad>' and '<unk>' items and without pretrained embeddings.
    """
    def __init__(self, gpu):
        super(SeqIndexerBaseChar, self).__init__(gpu=gpu, check_for_lowercase=False, zero_digits=False,
                                                 pad='<pad>', unk='<unk>', load_embeddings=False,
                                                 embeddings_dim=0, verbose=True)

    def add_char(self, c):
        """Register character `c` unless it is already known."""
        if self.item_exists(c):
            return
        self.add_item(c)

    def get_char_tensor(self, curr_char_seq, word_len):
        """Convert a list of character lists into a centre-aligned index tensor.

        Returns a tensor with one row per entry of `curr_char_seq` and `word_len` columns.
        """
        char_tensor = super(SeqIndexerBaseChar, self).items2tensor(curr_char_seq, align='center',
                                                                   word_len=word_len)
        return char_tensor  # curr_seq_len x word_len

## Layers

In [10]:
class LayerBase(nn.Module):
    """Common base for all layers.

    Subclasses are expected to provide `is_cuda()`, which `tensor_ensure_gpu`
    relies on; it is not defined here.
    """
    def __init__(self, gpu):
        super().__init__()
        self.gpu = gpu

    def tensor_ensure_gpu(self, tensor):
        """Return `tensor` on the layer's CUDA device if the layer is on CUDA, otherwise on the CPU."""
        if not self.is_cuda():
            return tensor.cpu()
        return tensor.cuda(device=self.gpu)

    def apply_mask(self, input_tensor, mask_tensor):
        """Multiply `input_tensor` by `mask_tensor` broadcast over the last dimension."""
        values = self.tensor_ensure_gpu(input_tensor)
        mask = self.tensor_ensure_gpu(mask_tensor)
        expanded_mask = mask.unsqueeze(-1).expand_as(values)
        return values * expanded_mask

    def get_seq_len_list_from_mask_tensor(self, mask_tensor):
        """Return, for every row of `mask_tensor`, the sum of its entries as an int."""
        seq_lens = []
        for row in range(mask_tensor.shape[0]):
            seq_lens.append(int(mask_tensor[row].sum().item()))
        return seq_lens

In [11]:
class LayerBiRNNBase(LayerBase):
    """Common base for bidirectional recurrent layers: stores the dimensions and
    provides packing/unpacking of padded batches."""
    def __init__(self, input_dim, hidden_dim, gpu):
        super().__init__(gpu)
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # Two directions are concatenated in the output.
        self.output_dim = hidden_dim * 2

    def sort_by_seq_len_list(self, seq_len_list):
        """Compute the permutation ordering sequences from longest to shortest.

        Returns:
            (sorted_lens, sort_index, reverse_sort_index): the lengths in
            descending order, the LongTensor of original positions in sorted
            order, and the LongTensor mapping each original position to its rank.
        """
        order = sorted(range(len(seq_len_list)), key=lambda pos: seq_len_list[pos], reverse=True)
        inverse_order = [-1] * len(seq_len_list)
        for rank, original_pos in enumerate(order):
            inverse_order[original_pos] = rank
        sort_index = self.tensor_ensure_gpu(torch.tensor(order, dtype=torch.long))
        reverse_sort_index = self.tensor_ensure_gpu(torch.tensor(inverse_order, dtype=torch.long))
        sorted_lens = [seq_len_list[pos] for pos in order]
        return sorted_lens, sort_index, reverse_sort_index

    def pack(self, input_tensor, mask_tensor):
        """Sort the batch by decreasing length (derived from `mask_tensor`) and pack it.

        Returns:
            (packed_sequence, reverse_sort_index), the latter being what
            `unpack` needs to restore the original batch order.
        """
        seq_lens = self.get_seq_len_list_from_mask_tensor(mask_tensor)
        sorted_lens, sort_index, reverse_sort_index = self.sort_by_seq_len_list(seq_lens)
        sorted_input = torch.index_select(input_tensor, dim=0, index=sort_index)
        packed = pack_padded_sequence(sorted_input, lengths=sorted_lens, batch_first=True)
        return packed, reverse_sort_index

    def unpack(self, output_packed, max_seq_len, reverse_sort_index):
        """Pad a packed output back to `max_seq_len` and restore the original batch order."""
        padded_sorted, _ = pad_packed_sequence(output_packed, batch_first=True, total_length=max_seq_len)
        return torch.index_select(padded_sorted, dim=0, index=reverse_sort_index)

In [12]:
class LayerWordEmbeddings(LayerBase):
    """Word embedding layer initialised from the vectors held by a word sequence indexer."""
    def __init__(self, word_seq_indexer, gpu, freeze_word_embeddings=False, pad_idx=0):
        super().__init__(gpu)
        pretrained = word_seq_indexer.get_loaded_embeddings_tensor()
        embedding_layer = nn.Embedding.from_pretrained(embeddings=pretrained, freeze=freeze_word_embeddings)
        embedding_layer.padding_idx = pad_idx
        self.embeddings = embedding_layer
        self.word_seq_indexer = word_seq_indexer
        self.freeze_embeddings = freeze_word_embeddings
        self.embeddings_num, self.embeddings_dim = pretrained.shape[0], pretrained.shape[1]
        self.output_dim = self.embeddings_dim

    def is_cuda(self):
        """Return True if the embedding weights live on a CUDA device."""
        return self.embeddings.weight.is_cuda

    def forward(self, word_sequences):
        """Embed a batch of word sequences.

        Args:
            word_sequences: list of lists of words.

        Returns:
            Tensor of shape batch_size x max_seq_len x output_dim.
        """
        word_indices = self.word_seq_indexer.items2tensor(word_sequences)
        word_indices = self.tensor_ensure_gpu(word_indices)  # shape: batch_size x max_seq_len
        return self.embeddings(word_indices)  # shape: batch_size x max_seq_len x output_dim

In [13]:
class LayerCharEmbeddings(LayerBase):
    """LayerCharEmbeddings implements character-level embeddings."""
    def __init__(self, gpu, char_embeddings_dim, freeze_char_embeddings=False, word_len=20, unique_characters_list=None):
        """
        Args:
            gpu: device index forwarded to LayerBase and to the character indexer.
            char_embeddings_dim: size of each character embedding.
            freeze_char_embeddings: if True, the embedding weights are excluded
                from gradient updates.
            word_len: fixed number of characters every word is padded/truncated to.
            unique_characters_list: characters to register; defaults to
                `string.printable`.
        """
        super(LayerCharEmbeddings, self).__init__(gpu)
        self.char_embeddings_dim = char_embeddings_dim
        self.freeze_char_embeddings = freeze_char_embeddings
        self.word_len = word_len # standard len to pad
        # Init character sequences indexer
        self.char_seq_indexer = SeqIndexerBaseChar(gpu=gpu)
        if unique_characters_list is None:
            unique_characters_list = list(string.printable)
        for c in unique_characters_list:
            self.char_seq_indexer.add_char(c)
        # Init character embedding
        self.embeddings = nn.Embedding(num_embeddings=self.char_seq_indexer.get_items_count(),
                                       embedding_dim=char_embeddings_dim,
                                       padding_idx=0)
        # Honour the freeze flag (it used to be stored but never applied).
        self.embeddings.weight.requires_grad = not freeze_char_embeddings
        # nn.init.uniform_(self.embeddings.weight, -0.5, 0.5) # Option: Ma, 2016

    def is_cuda(self):
        """Return True if the embedding weights live on a CUDA device."""
        return self.embeddings.weight.is_cuda

    def forward(self, word_sequences):
        """Embed every character of every word of a batch of word sequences.

        Args:
            word_sequences: list of lists of words (strings).

        Returns:
            Tensor of shape batch_num x max_seq_len x char_embeddings_dim x word_len.
        """
        batch_num = len(word_sequences)
        max_seq_len = max([len(word_seq) for word_seq in word_sequences])
        char_sequences = [[list(word) for word in word_seq] for word_seq in word_sequences]
        input_tensor = self.tensor_ensure_gpu(torch.zeros(batch_num, max_seq_len, self.word_len, dtype=torch.long))
        for n, curr_char_seq in enumerate(char_sequences):
            curr_seq_len = len(curr_char_seq)
            curr_char_seq_tensor = self.char_seq_indexer.get_char_tensor(curr_char_seq, self.word_len) # curr_seq_len x word_len
            input_tensor[n, :curr_seq_len, :] = curr_char_seq_tensor
        char_embeddings_feature = self.embeddings(input_tensor)
        return char_embeddings_feature.permute(0, 1, 3, 2) # shape: batch_num x max_seq_len x char_embeddings_dim x word_len

In [14]:
class LayerCharCNN(LayerBase):
    """Character-level feature extractor: three depthwise 1D convolutions with different
    window sizes, concatenated along channels and max-pooled over the characters of a word.

    NOTE(review): `output_dim` is `char_embeddings_dim * filter_num` while exactly
    three convolutions are concatenated, so this looks like it only works with
    `filter_num == 3` - confirm against the callers.
    """
    def __init__(self, gpu, char_embeddings_dim, filter_num, char_window_size, word_len):
        super().__init__(gpu)
        self.char_embeddings_dim = char_embeddings_dim
        self.char_cnn_filter_num = filter_num
        self.char_window_size = char_window_size
        self.word_len = word_len
        self.output_dim = char_embeddings_dim * filter_num

        def make_conv(window):
            # One filter per embedding channel (groups == channels); "same"
            # padding keeps the word length unchanged.
            return nn.Conv1d(in_channels=char_embeddings_dim,
                             out_channels=char_embeddings_dim,
                             kernel_size=window,
                             groups=char_embeddings_dim,
                             padding="same")

        # Created in this fixed order so parameter initialisation is unchanged.
        self.conv1 = make_conv(char_window_size[0])
        self.conv2 = make_conv(char_window_size[1])
        self.conv3 = make_conv(char_window_size[2])

    def is_cuda(self):
        """Return True if the convolution weights live on a CUDA device."""
        return self.conv1.weight.is_cuda

    def forward(self, char_embeddings_feature): # batch_num x max_seq_len x char_embeddings_dim x word_len
        """Compute one pooled character feature vector per word position.

        Returns:
            Tensor of shape batch_num x max_seq_len x output_dim.
        """
        batch_num, max_seq_len, _, _ = char_embeddings_feature.shape
        pooled = self.tensor_ensure_gpu(torch.zeros(batch_num, max_seq_len, self.output_dim, dtype=torch.float))
        conv_layers = (self.conv1, self.conv2, self.conv3)
        for pos in range(max_seq_len):
            chars_at_pos = char_embeddings_feature[:, pos, :, :]
            stacked = torch.cat([conv(chars_at_pos) for conv in conv_layers], dim=1)
            # Max over the character axis.
            pooled[:, pos, :] = torch.max(stacked, dim=2)[0]
        return pooled # shape: batch_num x max_seq_len x filter_num*char_embeddings_dim

In [15]:
class LayerCharBiLSTM(LayerBase):
    """Character-level bidirectional LSTM layer: runs a one-layer BiLSTM over the characters
    of each word and keeps the output at the last character position."""
    def __init__(self, gpu, char_embeddings_dim, char_hidden_dim):
        super().__init__(gpu)
        self.char_embeddings_dim = char_embeddings_dim
        self.char_hidden_dim = char_hidden_dim
        # Forward and backward hidden states are concatenated.
        self.output_dim = 2 * char_hidden_dim
        self.lstm = nn.LSTM(input_size=char_embeddings_dim,
                            hidden_size=char_hidden_dim,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

    def is_cuda(self):
        """Return True if the LSTM weights live on a CUDA device."""
        return self.lstm.weight_hh_l0.is_cuda

    def forward(self, char_embeddings_feature): # batch_num x max_seq_len x char_embeddings_dim x word_len
        """Compute one character-level feature vector per word position.

        NOTE(review): the feature is the LSTM output at the last character
        position for both directions; for the backward direction that output has
        only processed the final position - confirm this is intended.

        Returns:
            Tensor of shape batch_size x max_seq_len x hidden_dim*2.
        """
        batch_num, max_seq_len, _, _ = char_embeddings_feature.shape
        word_features = self.tensor_ensure_gpu(torch.zeros(batch_num, max_seq_len, self.output_dim, dtype=torch.float))
        for pos in range(max_seq_len):
            # batch_num x word_len x char_embeddings_dim, as expected with batch_first=True
            chars_at_pos = char_embeddings_feature[:, pos, :, :].permute(0, 2, 1)
            lstm_out, _ = self.lstm(chars_at_pos)
            word_features[:, pos, :] = lstm_out[:, -1, :]
        return word_features  # shape: batch_size x max_seq_len x hidden_dim*2

In [16]:
class LayerBiLSTM(LayerBiRNNBase):
    """Standard single-layer bidirectional LSTM recurrent layer over padded sequences."""
    def __init__(self, input_dim, hidden_dim, gpu):
        super().__init__(input_dim, hidden_dim, gpu)
        self.num_layers = 1
        self.num_directions = 2
        self.rnn = nn.LSTM(input_size=input_dim,
                           hidden_size=hidden_dim,
                           num_layers=1,
                           batch_first=True,
                           bidirectional=True)

    def lstm_custom_init(self):
        """Xavier-initialize the weights, zero the biases, then set forget-gate biases to 1."""
        lstm = self.rnn
        weights = (lstm.weight_hh_l0, lstm.weight_hh_l0_reverse,
                   lstm.weight_ih_l0, lstm.weight_ih_l0_reverse)
        for weight in weights:
            nn.init.xavier_uniform_(weight)
        biases = (lstm.bias_hh_l0, lstm.bias_hh_l0_reverse,
                  lstm.bias_ih_l0, lstm.bias_ih_l0_reverse)
        for bias in biases:
            bias.data.fill_(0)
        # The second quarter of every bias vector is the forget-gate slice.
        for param_names in lstm._all_weights:
            for param_name in param_names:
                if 'bias' not in param_name:
                    continue
                bias = getattr(lstm, param_name)
                size = bias.size(0)
                bias.data[size // 4:size // 2].fill_(1.)

    def forward(self, input_tensor, mask_tensor): #input_tensor shape: batch_size x max_seq_len x dim
        """Run the BiLSTM over the masked input and return the per-token outputs."""
        batch_size, max_seq_len, _ = input_tensor.shape
        packed_in, restore_order = self.pack(input_tensor, mask_tensor)
        state_shape = (self.num_layers * self.num_directions, batch_size, self.hidden_dim)
        h0 = self.tensor_ensure_gpu(torch.zeros(*state_shape))
        c0 = self.tensor_ensure_gpu(torch.zeros(*state_shape))
        packed_out, _ = self.rnn(packed_in, (h0, c0))
        # shape: batch_size x max_seq_len x hidden_dim*2
        return self.unpack(packed_out, max_seq_len, restore_order)

    def is_cuda(self):
        """Return True when the LSTM weights live on a CUDA device."""
        return self.rnn.weight_hh_l0.is_cuda

In [17]:
class LayerAttention(LayerBase):
    """LayerAttention re-weights every time step of a sequence by a learned attention score.

    A single learned vector scores each time step; the scores are passed through
    ReLU + softmax, masked, renormalized, and used to scale the input features.
    """
    def __init__(self, gpu, hidden_dim):
        super(LayerAttention, self).__init__(gpu)
        self.hidden_dim = hidden_dim
        self.att_weights = nn.Parameter(torch.Tensor(1, self.hidden_dim))
        self.output_dim = hidden_dim
        stdv = 1.0 / np.sqrt(self.hidden_dim)
        nn.init.uniform_(self.att_weights, -stdv, stdv)

    def is_cuda(self):
        """Return True when the attention weights live on a CUDA device."""
        return self.att_weights.is_cuda

    def forward(self, input_tensor, mask_tensor):
        """Scale each time step of ``input_tensor`` by its attention weight.

        :param input_tensor: batch_size x max_seq_len x hidden_dim.
        :param mask_tensor: batch_size x max_seq_len, 1 for real tokens and 0 for padding.
        :return: tensor with the same shape as ``input_tensor``.
        """
        # (batch_size, max_seq_len, hidden_dim) @ (hidden_dim,) -> (batch_size, max_seq_len).
        # Projecting with matmul keeps both leading axes even when one of them has size 1;
        # a bare squeeze() would also drop them and make the softmax run over the wrong axis.
        scores = torch.matmul(input_tensor, self.att_weights.squeeze(0))
        attentions = torch.softmax(F.relu(scores), dim=-1)
        # apply mask and renormalize attention scores so padding gets zero weight
        masked = attentions * mask_tensor
        attentions = masked / masked.sum(-1, keepdim=True)
        # apply attention weights (broadcast over the hidden axis)
        return input_tensor * attentions.unsqueeze(-1)

In [18]:
class LayerCRF(LayerBase):
    """LayerCRF implements Conditional Random Fields (Ma.et.al., 2016 style)

    ``transition_matrix[i, j]`` is the score of moving from state ``j`` to state ``i``.
    A score of -9999.0 is used to mark transitions that must never be taken.
    """
    def __init__(self, gpu, states_num, pad_idx, sos_idx, tag_seq_indexer, verbose=True):
        super(LayerCRF, self).__init__(gpu)
        self.states_num = states_num
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.tag_seq_indexer = tag_seq_indexer
        self.tag_seq_indexer.add_tag('<sos>')
        self.verbose = verbose
        # Transition matrix contains log probabilities from state j to state i
        self.transition_matrix = nn.Parameter(torch.zeros(states_num, states_num, dtype=torch.float))
        nn.init.normal_(self.transition_matrix, -1, 0.1)
        # Default initialization: nothing moves into <sos>, nothing moves into or out of
        # <pad>, except that <pad> may follow <pad>.
        self.transition_matrix.data[self.sos_idx, :] = -9999.0
        self.transition_matrix.data[:, self.pad_idx] = -9999.0
        self.transition_matrix.data[self.pad_idx, :] = -9999.0
        self.transition_matrix.data[self.pad_idx, self.pad_idx] = 0.0

    def get_empirical_transition_matrix(self, tag_sequences_train, tag_seq_indexer=None):
        """Count the tag transitions observed in ``tag_sequences_train``.

        :param tag_sequences_train: iterable of tag sequences (lists of tag strings).
        :param tag_seq_indexer: indexer providing ``item2idx_dict``; defaults to the layer's own.
        :return: states_num x states_num long tensor, entry [i, j] counts transitions j -> i
            (column ``sos_idx`` counts sequence starts).
        :raises KeyError: if a sequence contains a tag unknown to the indexer.
        """
        if tag_seq_indexer is None:
            tag_seq_indexer = self.tag_seq_indexer
        item2idx = tag_seq_indexer.item2idx_dict
        empirical_transition_matrix = torch.zeros(self.states_num, self.states_num, dtype=torch.long)
        for tag_seq in tag_sequences_train:
            if len(tag_seq) == 0:
                # An empty sequence contributes no transitions.
                continue
            try:
                idx_seq = [item2idx[tag] for tag in tag_seq]
            except KeyError as err:
                raise KeyError('Unknown tag %s in tag sequence %s' % (err, tag_seq)) from err
            empirical_transition_matrix[idx_seq[0], self.sos_idx] += 1
            for j, i in zip(idx_seq, idx_seq[1:]):
                empirical_transition_matrix[i, j] += 1
        return empirical_transition_matrix

    def init_transition_matrix_empirical(self, tag_sequences_train):
        """Forbid (set to -9999.0) every transition that never occurs in the train data."""
        # Calculate statistics for tag transitions
        empirical_transition_matrix = self.get_empirical_transition_matrix(tag_sequences_train)
        # Initialize
        items_count = self.tag_seq_indexer.get_items_count()
        for i in range(items_count):
            for j in range(items_count):
                if empirical_transition_matrix[i, j] == 0:
                    self.transition_matrix.data[i, j] = -9999.0
        if self.verbose:
            print('Empirical transition matrix from the train dataset:')
            self.pretty_print_transition_matrix(empirical_transition_matrix)
            print('\nInitialized transition matrix:')
            self.pretty_print_transition_matrix(self.transition_matrix.data)

    def pretty_print_transition_matrix(self, transition_matrix, tag_seq_indexer=None):
        """Print ``transition_matrix`` as a table labelled with the indexer's tags."""
        if tag_seq_indexer is None:
            tag_seq_indexer = self.tag_seq_indexer
        items_count = tag_seq_indexer.get_items_count()
        labels = [tag_seq_indexer.idx2item_dict[i] for i in range(items_count)]
        # Header row, then an empty line, then one row per tag.
        lines = ['%10s' % '' + ''.join('%10s' % label for label in labels), '']
        for i, label in enumerate(labels):
            cells = ''.join('%10s' % ('%1.1f' % transition_matrix[i, j]) for j in range(items_count))
            lines.append('%10s' % label + cells)
        print('\n'.join(lines))

    def is_cuda(self):
        """Return True when the transition matrix lives on a CUDA device."""
        return self.transition_matrix.is_cuda

    def numerator(self, features_rnn_compressed, states_tensor, mask_tensor):
        """Score of the gold tag path (emissions + transitions) for every sequence.

        :param features_rnn_compressed: batch_num x max_seq_len x states_num emission scores.
        :param states_tensor: batch_num x max_seq_len gold state indices.
        :param mask_tensor: batch_num x max_seq_len, 1 for real tokens and 0 for padding.
        :return: tensor of shape (batch_num,).
        """
        batch_num, max_seq_len = mask_tensor.shape
        start_states_tensor = self.tensor_ensure_gpu(torch.zeros(batch_num, 1, dtype=torch.long).fill_(self.sos_idx))
        states_tensor = torch.cat([start_states_tensor, states_tensor], 1)
        prev_states = states_tensor[:, :max_seq_len]       # state at step n - 1 (<sos> for n = 0)
        next_states = states_tensor[:, 1:max_seq_len + 1]  # state at step n
        # Emission score of the gold state at every step: batch_num x max_seq_len
        emissions = features_rnn_compressed[:, :max_seq_len].gather(2, next_states.unsqueeze(-1)).squeeze(-1)
        # Transition score prev -> next at every step: batch_num x max_seq_len
        transitions = self.transition_matrix[next_states, prev_states]
        # Padded steps are zeroed by the mask before summing over time.
        return ((emissions + transitions) * mask_tensor).sum(dim=1)

    def denominator(self, features_rnn_compressed, mask_tensor):
        """Log-partition term computed with the forward algorithm.

        :param features_rnn_compressed: batch x max_seq_len x states_num emission scores.
        :param mask_tensor: batch_num x max_seq_len, 1 for real tokens and 0 for padding.
        :return: result of ``log_sum_exp`` over the final per-state scores.
        """
        batch_num, max_seq_len = mask_tensor.shape
        score = self.tensor_ensure_gpu(torch.zeros(batch_num, self.states_num, dtype=torch.float).fill_(-9999.0))
        score[:, self.sos_idx] = 0.
        for n in range(max_seq_len):
            curr_mask = mask_tensor[:, n].unsqueeze(-1).expand_as(score)
            # Axis 1 indexes the current state, axis 2 the previous state.
            curr_score = score.unsqueeze(1).expand(-1, *self.transition_matrix.size())
            curr_emission = features_rnn_compressed[:, n].unsqueeze(-1).expand_as(curr_score)
            curr_transition = self.transition_matrix.unsqueeze(0).expand_as(curr_score)
            # NOTE(review): log_sum_exp is defined elsewhere in this file; presumably it
            # reduces over the last axis like torch.logsumexp(..., dim=2) - confirm.
            curr_score = log_sum_exp(curr_score + curr_emission + curr_transition)
            # Keep the previous score on padded steps.
            score = curr_score * curr_mask + score * (1 - curr_mask)
        score = log_sum_exp(score)
        return score

    def decode_viterbi(self, features_rnn_compressed, mask_tensor):
        """Find the highest-scoring state path of every sequence with the Viterbi algorithm.

        :param features_rnn_compressed: batch x max_seq_len x states_num emission scores.
        :param mask_tensor: batch_num x max_seq_len, 1 for real tokens and 0 for padding.
        :return: list (one entry per sequence) of lists of state indices.
        """
        batch_size, max_seq_len = mask_tensor.shape
        seq_len_list = [int(mask_tensor[k].sum().item()) for k in range(batch_size)]
        # Step 1. Calculate scores & backpointers
        score = self.tensor_ensure_gpu(torch.Tensor(batch_size, self.states_num).fill_(-9999.))
        score[:, self.sos_idx] = 0.0
        backpointers = self.tensor_ensure_gpu(torch.LongTensor(batch_size, max_seq_len, self.states_num))
        for n in range(max_seq_len):
            curr_emissions = features_rnn_compressed[:, n]
            curr_score = self.tensor_ensure_gpu(torch.Tensor(batch_size, self.states_num))
            curr_backpointers = self.tensor_ensure_gpu(torch.LongTensor(batch_size, self.states_num))
            for curr_state in range(self.states_num):
                T = self.transition_matrix[curr_state, :].unsqueeze(0).expand(batch_size, self.states_num)
                max_values, max_indices = torch.max(score + T, 1)
                curr_score[:, curr_state] = max_values
                curr_backpointers[:, curr_state] = max_indices
            curr_mask = mask_tensor[:, n].unsqueeze(1).expand(batch_size, self.states_num)
            # Scores are frozen on padded steps, so the final score is the score at the
            # last real token of every sequence.
            score = score * (1 - curr_mask) + (curr_score + curr_emissions) * curr_mask
            backpointers[:, n, :] = curr_backpointers # shape: batch_size x max_seq_len x state_num
        best_score_batch, last_best_state_batch = torch.max(score, 1)
        # Step 2. Find the best path by following the backpointers from the last token
        best_path_batch = []
        for k, last_state in enumerate(last_best_state_batch.tolist()):
            curr_best_state = last_state
            path = [curr_best_state]
            for n in range(seq_len_list[k] - 1, 0, -1):
                curr_best_state = backpointers[k, n, curr_best_state].item()
                path.append(curr_best_state)
            path.reverse()
            best_path_batch.append(path)
        return best_path_batch

## Model

In [19]:
class TaggerBase(nn.Module):
    """TaggerBase is an abstract class for tagger models. It implements the tagging functionality for
    different types of inputs (sequences of tokens, sequences of integer indices, tensors). Auxiliary class
    SequencesIndexer is used for input and output data formats conversions. Abstract method `forward` is used in order
    to make these predictions, it has to be implemented in descendants."""
    def __init__(self,  word_seq_indexer, tag_seq_indexer, gpu, batch_size):
        super(TaggerBase, self).__init__()
        self.word_seq_indexer = word_seq_indexer
        self.tag_seq_indexer = tag_seq_indexer
        self.gpu = gpu  # CUDA device index; a negative value keeps everything on CPU
        self.batch_size = batch_size

    def tensor_ensure_gpu(self, tensor):
        """Move ``tensor`` to the configured CUDA device, or return it unchanged on CPU setups."""
        if self.gpu >= 0:
            return tensor.cuda(device=self.gpu)
        else:
            return tensor

    def self_ensure_gpu(self):
        """Move the whole model to the configured device (CUDA if ``gpu >= 0``, else CPU)."""
        if self.gpu >= 0:
            self.cuda(device=self.gpu)
        else:
            self.cpu()

    def save_tagger(self, checkpoint_fn):
        """Serialize the whole model object to ``checkpoint_fn``.

        The model is moved to CPU for saving and then restored to its configured device.
        """
        self.cpu()
        torch.save(self, checkpoint_fn)
        self.self_ensure_gpu()

    def forward(self, *input):
        """Abstract: subclasses return the model outputs for a batch of word sequences."""
        pass

    def predict_idx_from_words(self, word_sequences):
        """Return, for every word of every sequence, the index of the best-scoring class.

        The first output component is ignored, so returned indices start at 1.
        """
        self.eval()
        # Inference only: no autograd graph is needed for argmax decoding.
        with torch.no_grad():
            outputs_tensor = self.forward(word_sequences) # batch_size x num_class+1 x max_seq_len
        output_idx_sequences = list()
        for k in range(len(word_sequences)):
            idx_seq = list()
            for l in range(len(word_sequences[k])):
                curr_output = outputs_tensor[k, 1:, l] # ignore the first component of output
                max_no = curr_output.argmax(dim=0)
                idx_seq.append(max_no.item() + 1)
            output_idx_sequences.append(idx_seq)
        return output_idx_sequences

    def predict_tags_from_words(self, word_sequences, batch_size=-1):
        """Predict tag sequences for ``word_sequences``, processing them batch by batch.

        :param word_sequences: list of word sequences.
        :param batch_size: batch size to use; -1 means ``self.batch_size``.
        :return: list of predicted tag sequences, one per input sequence.
        """
        if batch_size == -1:
            batch_size = self.batch_size
        print('\n')
        batch_num = len(word_sequences) // batch_size
        if len(word_sequences) > 0 and len(word_sequences) < batch_size:
            batch_num = 1
        output_tag_sequences = list()
        for n in range(batch_num):
            i = n*batch_size
            # The last batch also takes the remainder that does not fill a whole batch.
            if n < batch_num - 1:
                j = (n + 1)*batch_size
            else:
                j = len(word_sequences)
            curr_output_idx = self.predict_idx_from_words(word_sequences[i:j])
            curr_output_tag_sequences = self.tag_seq_indexer.idx2items(curr_output_idx)
            output_tag_sequences.extend(curr_output_tag_sequences)
            # Report progress after the batch is done, so the last batch shows 100%.
            print('\r++ predicting, batch %d/%d (%1.2f%%).' % (n + 1, batch_num,
                                                                math.ceil((n + 1) * 100.0 / batch_num)),
                  end='', flush=True)
        return output_tag_sequences

    def get_mask_from_word_sequences(self, word_sequences):
        """Build a float mask with 1 at real-token positions and 0 at padding positions."""
        batch_num = len(word_sequences)
        max_seq_len = max([len(word_seq) for word_seq in word_sequences])
        mask_tensor = self.tensor_ensure_gpu(torch.zeros(batch_num, max_seq_len, dtype=torch.float))
        for k, word_seq in enumerate(word_sequences):
            mask_tensor[k, :len(word_seq)] = 1
        return mask_tensor # batch_size x max_seq_len

    def apply_mask(self, input_tensor, mask_tensor):
        """Zero the features of ``input_tensor`` at positions where ``mask_tensor`` is 0."""
        input_tensor = self.tensor_ensure_gpu(input_tensor)
        mask_tensor = self.tensor_ensure_gpu(mask_tensor)
        return input_tensor*mask_tensor.unsqueeze(-1).expand_as(input_tensor)

In [20]:
class TaggerBiRNNCNNCRF(TaggerBase):
    """TaggerBiRNNCNNCRF is a model for sequences tagging that includes recurrent network + conv layer + CRF.

    Word embeddings, char-CNN features and char-BiLSTM features are concatenated, projected to
    ``emb_dim``, passed through a BiGRU/BiLSTM and an attention layer, projected to
    ``class_num + 2`` states and decoded with a CRF.
    """
    def __init__(self, word_seq_indexer, tag_seq_indexer, class_num, batch_size=1, rnn_hidden_dim=100,
                 emb_dim = 200, freeze_word_embeddings=False, dropout_ratio=0.5, rnn_type='GRU', gpu=-1,
                 freeze_char_embeddings = False, char_embeddings_dim=100, word_len=20, char_cnn_filter_num=30,
                 char_window_size=3):
        super(TaggerBiRNNCNNCRF, self).__init__(word_seq_indexer, tag_seq_indexer, gpu, batch_size)
        self.class_num = class_num
        self.rnn_hidden_dim = rnn_hidden_dim
        self.emb_dim = emb_dim
        self.freeze_embeddings = freeze_word_embeddings
        self.dropout_ratio = dropout_ratio
        self.rnn_type = rnn_type
        self.freeze_char_embeddings = freeze_char_embeddings
        self.char_embeddings_dim = char_embeddings_dim
        self.word_len = word_len
        self.char_cnn_filter_num = char_cnn_filter_num
        self.char_window_size = char_window_size
        # Feature extractors (the word embeddings layer is built exactly once).
        self.word_embeddings_layer = LayerWordEmbeddings(word_seq_indexer, gpu, freeze_word_embeddings)
        self.char_embeddings_layer = LayerCharEmbeddings(gpu, char_embeddings_dim, freeze_char_embeddings,
                                                         word_len, word_seq_indexer.get_unique_characters_list())
        self.char_cnn_layer = LayerCharCNN(gpu, char_embeddings_dim, char_cnn_filter_num, char_window_size,
                                           word_len)
        self.char_lstm_layer = LayerCharBiLSTM(gpu,char_embeddings_dim,char_embeddings_dim)
        self.dropout = torch.nn.Dropout(p=dropout_ratio)

        if rnn_type == 'GRU':
            self.birnn_layer = LayerBiGRU(input_dim=self.emb_dim,
                                          hidden_dim=rnn_hidden_dim,
                                          gpu=gpu)
        elif rnn_type == 'LSTM':
            self.birnn_layer = LayerBiLSTM(input_dim=self.emb_dim,
                                           hidden_dim=rnn_hidden_dim,
                                           gpu=gpu)
        else:
            raise ValueError('Unknown rnn_type = %s, must be either "LSTM" or "GRU"' % rnn_type)
        # Projects the concatenated word/char features down to the BiRNN input size.
        self.lin_layer1 = nn.Linear(in_features=self.word_embeddings_layer.output_dim + self.char_cnn_layer.output_dim + self.char_lstm_layer.output_dim,
                                    out_features=self.emb_dim)
        self.att_layer = LayerAttention(gpu=gpu, hidden_dim=self.birnn_layer.output_dim)
        # Two extra states on top of class_num; the last one (class_num + 1) is <sos>.
        self.lin_layer2 = nn.Linear(in_features=self.birnn_layer.output_dim + self.att_layer.output_dim, out_features=class_num + 2)
        self.crf_layer = LayerCRF(gpu, states_num=class_num + 2, pad_idx=tag_seq_indexer.pad_idx, sos_idx=class_num + 1,
                                  tag_seq_indexer=tag_seq_indexer)
        self.softmax = nn.Softmax(dim=2)
        if gpu >= 0:
            self.cuda(device=self.gpu)

    def _forward_birnn(self, word_sequences):
        """Compute masked per-token state scores (CRF emissions) for a batch of word sequences."""
        mask = self.get_mask_from_word_sequences(word_sequences)
        z_word_embed = self.word_embeddings_layer(word_sequences)
        z_word_embed_d = self.dropout(z_word_embed)
        z_char_embed = self.char_embeddings_layer(word_sequences)
        z_char_cnn = self.char_cnn_layer(z_char_embed)
        z_char_cnn_d = self.dropout(z_char_cnn)
        z_char_lstm = self.char_lstm_layer(z_char_embed)
        z_char_lstm_d = self.dropout(z_char_lstm)
        z = torch.cat((z_word_embed_d, z_char_cnn_d, z_char_lstm_d), dim=2)
        z = self.lin_layer1(z)
        rnn_output_h = self.apply_mask(self.birnn_layer(z, mask), mask)
        att_rnn_output = self.att_layer(rnn_output_h, mask)
        features_rnn_att = torch.cat((rnn_output_h, att_rnn_output), dim=2)
        features_rnn_compressed = self.lin_layer2(features_rnn_att)
        return self.apply_mask(features_rnn_compressed, mask)

    def get_loss(self, word_sequences_train_batch, tag_sequences_train_batch):
        """Return the mean CRF negative log-likelihood of the gold tags for a training batch."""
        targets_tensor_train_batch = self.tag_seq_indexer.items2tensor(tag_sequences_train_batch)
        features_rnn = self._forward_birnn(word_sequences_train_batch) # batch_num x max_seq_len x class_num
        mask = self.get_mask_from_word_sequences(word_sequences_train_batch)  # batch_num x max_seq_len
        numerator = self.crf_layer.numerator(features_rnn, targets_tensor_train_batch, mask)
        denominator = self.crf_layer.denominator(features_rnn, mask)
        nll_loss = -torch.mean(numerator - denominator)
        return nll_loss

    def predict_idx_from_words(self, word_sequences, no=-1):
        """Viterbi-decode the best state index sequence for every word sequence.

        :param word_sequences: batch of word sequences.
        :param no: batch number passed by ``predict_tags_from_words``; not used here.
        :return: list of lists of state indices.
        """
        self.eval()
        # Inference only: the result is a list of ints, so no autograd graph is needed.
        with torch.no_grad():
            features_rnn_compressed_masked  = self._forward_birnn(word_sequences)
            mask = self.get_mask_from_word_sequences(word_sequences)
            idx_sequences = self.crf_layer.decode_viterbi(features_rnn_compressed_masked, mask)
        return idx_sequences

    def predict_tags_from_words(self, word_sequences, batch_size=-1):
        """Predict tag sequences for ``word_sequences``, processing them batch by batch.

        :param word_sequences: list of word sequences.
        :param batch_size: batch size to use; -1 means ``self.batch_size``.
        :return: list of predicted tag sequences, one per input sequence.
        """
        if batch_size == -1:
            batch_size = self.batch_size
        print('\n')
        batch_num = len(word_sequences) // batch_size
        if len(word_sequences) > 0 and len(word_sequences) < batch_size:
            batch_num = 1
        output_tag_sequences = list()
        for n in range(batch_num):
            i = n*batch_size
            # The last batch also takes the remainder that does not fill a whole batch.
            if n < batch_num - 1:
                j = (n + 1)*batch_size
            else:
                j = len(word_sequences)
            # With one sequence per batch the batch number identifies the sequence.
            no = n if batch_size == 1 else -1
            curr_output_idx = self.predict_idx_from_words(word_sequences[i:j], no)
            curr_output_tag_sequences = self.tag_seq_indexer.idx2items(curr_output_idx)
            output_tag_sequences.extend(curr_output_tag_sequences)
            # Report progress after the batch is done, so the last batch shows 100%.
            print('\r++ predicting, batch %d/%d (%1.2f%%).' % (n + 1, batch_num,
                                                                math.ceil((n + 1) * 100.0 / batch_num)),
                  end='', flush=True)
        return output_tag_sequences

## Evaluators

In [21]:
class EvaluatorBase():
    """Abstract base for evaluators; subclasses provide `get_evaluation_score`."""
    def get_evaluation_score_train_dev_test(self, tagger, datasets_bank, batch_size=-1):
        """Evaluate `tagger` on the train, dev and test parts of `datasets_bank`.

        Returns (score_train, score_dev, score_test, msg_test). A `batch_size` of -1
        means "use the tagger's own batch size".
        """
        if batch_size == -1:
            batch_size = tagger.batch_size
        results = []
        for part in ('train', 'dev', 'test'):
            part_result = self.predict_evaluation_score(
                tagger=tagger,
                word_sequences=getattr(datasets_bank, 'word_sequences_' + part),
                targets_tag_sequences=getattr(datasets_bank, 'tag_sequences_' + part),
                batch_size=batch_size)
            results.append(part_result)
        (score_train, _), (score_dev, _), (score_test, msg_test) = results
        return score_train, score_dev, score_test, msg_test

    def predict_evaluation_score(self, tagger, word_sequences, targets_tag_sequences, batch_size):
        """Tag `word_sequences` with `tagger` and score the result against the targets."""
        predicted = tagger.predict_tags_from_words(word_sequences, batch_size)
        return self.get_evaluation_score(targets_tag_sequences, predicted, word_sequences)

In [22]:
class EvaluatorF1MacroTokenLevel(EvaluatorBase):
    """EvaluatorF1MacroTokenLevel is a token-level macro-F1 evaluator: F1 is computed for each
    tag separately and then averaged over the tags found in the target sequences."""
    def __init__(self):
        self.tag_list = None  # sorted list of tags, built on the first evaluation
        self.tag2idx = dict()

    def __init_tag_list(self, targets_tag_sequences):
        """Collect the distinct target tags once; later calls reuse the first result."""
        if self.tag_list is not None:
            return
        self.tag_list = list()
        for tag_seq in targets_tag_sequences:
            for t in tag_seq:
                if t not in self.tag_list:
                    self.tag_list.append(t)
                    # NOTE: indices are 1-based and follow first-seen order; they are not
                    # positions in the (later sorted) tag_list.
                    self.tag2idx[t] = len(self.tag_list)
        self.tag_list.sort()

    def tag_seq_2_idx_list(self, tag_seq):
        """Map a sequence of tags to their ``tag2idx`` indices."""
        return [self.tag2idx[t] for t in tag_seq]

    def __get_zeros_tag_dict(self):
        """Return a fresh ``{tag: 0}`` dict over the known tags."""
        return {tag: 0 for tag in self.tag_list}

    def __add_dict(self, dict1, dict2):
        """Add ``dict2`` into ``dict1`` tag by tag (in place) and return ``dict1``."""
        for tag in self.tag_list:
            dict1[tag] += dict2[tag]
        return dict1

    def __div_dict(self, dict_in, d):
        """Divide every known tag's value in ``dict_in`` by ``d`` (in place) and return it."""
        for tag in self.tag_list:
            dict_in[tag] /= d
        return dict_in

    def __get_M_F1_msg(self, F1):
        """Return (macro-F1, human-readable report) for the per-tag ``F1`` dict."""
        msg = '\nF1 scores\n'
        msg += '-' * 24 + '\n'
        sum_M_F1 = 0
        for tag in self.tag_list:
            sum_M_F1 += F1[tag]
            msg += '%15s = %1.2f\n' % (tag, F1[tag])
        # With no tags at all (empty evaluation set) the macro average is defined as 0.
        M_F1 = sum_M_F1 / len(F1) if len(F1) > 0 else 0.0
        msg += '-'*24 + '\n'
        msg += 'Macro-F1 = %1.3f' % M_F1
        return M_F1, msg

    def __add_to_dict(self, dict_in, tag, val):
        """Add ``val`` to ``dict_in[tag]``, creating the entry if needed; return the dict."""
        if tag in dict_in:
            dict_in[tag] += val
        else:
            dict_in[tag] = val
        return dict_in

    def get_evaluation_score(self, targets_tag_sequences, outputs_tag_sequences, word_sequences=None):
        """Compute the token-level macro-F1 (in percent), print the report and return it.

        :param targets_tag_sequences: gold tag sequences.
        :param outputs_tag_sequences: predicted tag sequences, aligned with the targets.
        :param word_sequences: unused; accepted for interface compatibility.
        :return: (macro_f1, report_message).
        """
        # Create list of tags
        self.__init_tag_list(targets_tag_sequences)
        # Init values
        TP = self.__get_zeros_tag_dict()
        FP = self.__get_zeros_tag_dict()
        FN = self.__get_zeros_tag_dict()
        F1 = self.__get_zeros_tag_dict()
        for targets_seq, outputs_tag_seq in zip(targets_tag_sequences, outputs_tag_sequences):
            for t, o in zip(targets_seq, outputs_tag_seq):
                if t == o:
                    TP = self.__add_to_dict(TP, t, 1)
                else:
                    # A mismatch is a miss for the gold tag and a false alarm for the predicted one.
                    FN = self.__add_to_dict(FN, t, 1)
                    FP = self.__add_to_dict(FP, o, 1)
        # Calculate F1 for each tag; max(..., 1) avoids 0/0 for tags with no counts
        for tag in self.tag_list:
            F1[tag] = (2 * TP[tag] / max(2 * TP[tag] + FP[tag] + FN[tag], 1)) * 100
        # Calculate Macro-F1 score and prepare the message
        M_F1, msg = self.__get_M_F1_msg(F1)
        print(msg)
        return M_F1, msg

In [23]:
class EvaluatorAccuracyTokenLevel(EvaluatorBase):
    """EvaluatorAccuracyTokenLevel is token-level accuracy evaluator for each class of BOI-like tags."""
    def get_evaluation_score(self, targets_tag_sequences, outputs_tag_sequences, word_sequences=None):
        """Return (accuracy in percent, message) over all aligned target/output tokens.

        :param targets_tag_sequences: gold tag sequences.
        :param outputs_tag_sequences: predicted tag sequences, aligned with the targets.
        :param word_sequences: unused; accepted for interface compatibility.
        """
        cnt = 0
        match = 0
        for target_seq, output_seq in zip(targets_tag_sequences, outputs_tag_sequences):
            for t, o in zip(target_seq, output_seq):
                cnt += 1
                if t == o:
                    match += 1
        # An empty evaluation set has no tokens to score; report 0 instead of dividing by zero.
        acc = match*100.0/cnt if cnt > 0 else 0.0
        msg = '*** Token-level accuracy: %1.2f%% ***' % acc
        return acc, msg

## Report

In [24]:
class Report():
    def __init__(self, fn, args, score_names):
        """Accumulate evaluation results as text and mirror them to the file `fn` (if not None)."""
        self.fn = fn
        self.args = args
        self.score_num = len(score_names)
        # One hyperparameter per line, taken from the textual form of `args`.
        hyperparams = str(args).replace('Namespace(', '').replace(')', '').split(', ')
        columns = '|'.join(' %14s ' % score_name for score_name in score_names)
        header = '\n\n %14s |' % 'epoch ' + columns
        self.blank_line = '\n' + '-' * len(header)
        self.text = ''.join(['Evaluation\n\n', '\n'.join(hyperparams), header, self.blank_line])

    def write_epoch_scores(self, epoch, scores):
        """Append one table row holding the scores of `epoch`, then save."""
        cells = '|'.join(' %14s ' % ('%1.2f' % score) for score in scores)
        self.text += '\n %14s |' % ('%d'% epoch) + cells
        self.__save()

    def write_final_score(self, final_score_str):
        """Append a separator line followed by `final_score_str` on its own line, then save."""
        self.text += self.blank_line + '\n%s' % final_score_str
        self.__save()

    def write_msg(self, msg):
        """Append a separator line directly followed by `msg`, then save."""
        self.text += self.blank_line + msg
        self.__save()

    def write_input_arguments(self):
        """Append the input arguments as returned by `get_input_arguments()`, then save."""
        self.text += '\nInput arguments:\n%s' % get_input_arguments()
        self.__save()

    def write_final_line_score(self, final_score):
        """Append `final_score` with four decimals after an empty line, then save."""
        self.text += '\n\n%1.4f' % final_score
        self.__save()

    def __save(self):
        """Rewrite the report file with the full accumulated text; no-op when `fn` is None."""
        if self.fn is None:
            return
        with open(self.fn, mode='w') as text_file:
            text_file.write(self.text)

    def make_print(self):
        """Print the accumulated report text."""
        print(self.text)

## Factories

In [25]:
class DataIOFactory():
    """Factory that builds the data reader/writer selected by `args.data_io`."""
    @staticmethod
    def create(args):
        """Return the DataIO object for `args.data_io`; raise ValueError for unknown names."""
        if args.data_io != 'ncbi_disease':
            raise ValueError('Unknown DataIO %s.' % args.data_io)
        return DataIONCBI(dataset_name=args.data_io,
                          train_no=args.train_no,
                          dev_no=args.dev_no,
                          test_no=args.test_no)

In [26]:
class DatasetsBankFactory():
    """Factory that builds the datasets bank selected by `args.dataset_sort`."""
    @staticmethod
    def create(args):
        """Return a verbose DatasetsBankSorted when `args.dataset_sort` is truthy, else a DatasetsBank."""
        bank_class = DatasetsBankSorted if args.dataset_sort else DatasetsBank
        return bank_class(verbose=True)

In [27]:
class TaggerFactory():
    """TaggerFactory contains wrappers to create various tagger models."""
    @staticmethod
    def load(checkpoint_fn, gpu=-1):
        """Load a tagger saved as a whole model object and move it to the requested device.

        :param checkpoint_fn: path of the checkpoint file.
        :param gpu: CUDA device index, or a negative value for CPU.
        :return: the loaded tagger.
        :raises ValueError: if ``checkpoint_fn`` is not an existing file.
        """
        if not os.path.isfile(checkpoint_fn):
            raise ValueError('Can\'t find tagger in file "%s". Please, run the main script with non-empty '
                             '"--save-best-path" param to create it.' % checkpoint_fn)
        # NOTE(review): torch.load unpickles an arbitrary Python object here; only load
        # checkpoints that come from a trusted source.
        tagger = torch.load(checkpoint_fn)
        tagger.gpu = gpu

        # The indexers keep their own copy of the device index, so update them as well.
        tagger.word_seq_indexer.gpu = gpu # hotfix
        tagger.tag_seq_indexer.gpu = gpu # hotfix
        if hasattr(tagger, 'char_embeddings_layer'):# very hot hotfix
            tagger.char_embeddings_layer.char_seq_indexer.gpu = gpu # hotfix
        tagger.self_ensure_gpu()
        return tagger


    @staticmethod
    def create(args, word_seq_indexer, tag_seq_indexer, tag_sequences_train):
        """Build the tagger selected by ``args.model`` and initialize its CRF transitions.

        :param args: configuration object providing the model hyperparameters.
        :param word_seq_indexer: indexer for words.
        :param tag_seq_indexer: indexer for tags.
        :param tag_sequences_train: training tag sequences used to initialize the CRF
            transition matrix.
        :return: the created tagger.
        :raises ValueError: if ``args.model`` is not a known model name.
        """
        if args.model == 'BiRNNCNNCRF':
            tagger = TaggerBiRNNCNNCRF(word_seq_indexer=word_seq_indexer,
                                       tag_seq_indexer=tag_seq_indexer,
                                       class_num=tag_seq_indexer.get_class_num(),
                                       batch_size=args.batch_size,
                                       rnn_hidden_dim=args.rnn_hidden_dim,
                                       emb_dim=args.emb_dim,
                                       freeze_word_embeddings=args.freeze_word_embeddings,
                                       dropout_ratio=args.dropout_ratio,
                                       rnn_type=args.rnn_type,
                                       gpu=args.gpu,
                                       freeze_char_embeddings=args.freeze_char_embeddings,
                                       char_embeddings_dim=args.char_embeddings_dim,
                                       word_len=args.word_len,
                                       char_cnn_filter_num=args.char_cnn_filter_num,
                                       char_window_size=args.char_window_size)
            tagger.crf_layer.init_transition_matrix_empirical(tag_sequences_train)
        else:
            raise ValueError('Unknown tagger model')
        return tagger

In [28]:
class EvaluatorFactory():
    """Factory that builds the evaluator selected by `args.evaluator`."""
    @staticmethod
    def create(args):
        """Return a new evaluator for `args.evaluator`; raise ValueError for unknown names."""
        # Builders are wrapped in lambdas so a class name is only looked up when selected.
        builders = (
            ('f1-connl', lambda: EvaluatorF1MicroSpansConnl()),
            ('f1-alpha-match-10', lambda: EvaluatorF1MicroSpansAlphaMatch10()),
            ('f1-alpha-match-05', lambda: EvaluatorF1MicroSpansAlphaMatch05()),
            ('f1-macro', lambda: EvaluatorF1MacroTokenLevel()),
            ('f05-macro', lambda: EvaluatorF05MacroTokenLevel()),
            ('token-acc', lambda: EvaluatorAccuracyTokenLevel()),
        )
        for evaluator_name, build in builders:
            if args.evaluator == evaluator_name:
                return build()
        raise ValueError('Unknown evaluator %s.' % args.evaluator)

In [29]:
class OptimizerFactory:
    """Factory building the optimizer and learning-rate scheduler pair for a tagger."""
    @staticmethod
    def create(args, tagger):
        """Create the optimizer named by `args.opt` together with its decay scheduler.

        Args:
            args: configuration object; reads `opt`, `lr`, `lr_decay`, plus `momentum`
                (for 'sgd') or `weight_decay` (for 'adam').
            tagger: model whose `parameters()` are handed to the optimizer.

        Returns:
            Tuple `(optimizer, scheduler)`, where the scheduler is a `LambdaLR` that scales
            the base learning rate by `args.lr_decay ** epoch`.

        Raises:
            ValueError: if `args.opt` is neither 'sgd' nor 'adam'.
        """
        chosen = args.opt
        # Reject unsupported names before touching the model parameters.
        if chosen != 'sgd' and chosen != 'adam':
            raise ValueError('Unknown optimizer, must be one of "sgd"/"adam".')

        trainable = list(tagger.parameters())
        if chosen == 'sgd':
            optimizer = optim.SGD(trainable, lr=args.lr, momentum=args.momentum)
        else:
            optimizer = optim.Adam(trainable,
                                   lr=args.lr,
                                   betas=(0.9, 0.999),
                                   weight_decay=args.weight_decay)

        def decay_factor(epoch):
            # Multiplicative factor applied to the base learning rate at the given epoch.
            return args.lr_decay ** epoch

        return optimizer, LambdaLR(optimizer, lr_lambda=decay_factor)

## Main

In [30]:
class ARGS:
    """Notebook-wide configuration, used in place of command-line arguments.

    A single instance (`args`) is created right after the class and is read by the
    factories and by `main()`.
    """
    seed = 101  # default seed picked up by set_seed()
    verbose = True  # when True, main() prints the final report via report.make_print()
    debug = False
    data_io = "ncbi_disease"  # presumably selects the dataset reader in DataIOFactory -- verify there
    train_no = None
    dev_no = None
    test_no = None
    model = 'BiRNNCNNCRF'  # tagger architecture name, resolved by TaggerFactory.create
    rnn_type = 'LSTM'  # forwarded to the tagger as rnn_type
    load = None  # when not None, main() loads a tagger via TaggerFactory.load instead of creating one
    epoch_num = 10  # main() iterates epochs 0..epoch_num; epoch 0 only evaluates, no training
    min_epoch_num = 1  # early stopping can only trigger once epoch > min_epoch_num
    batch_size = 16  # training batch size
    gpu = 0  # CUDA device index; main() calls torch.cuda.set_device only when gpu >= 0
    check_for_lowercase = True  # forwarded to SeqIndexerWord
    emb_fn = "/kaggle/input/glove6b/glove.6B.200d.txt"  # word-embeddings file read by the word indexer
    emb_dim = 200  # word-embedding dimension (passed to SeqIndexerWord and the tagger)
    emb_delimiter = ' '  # field delimiter used when reading emb_fn
    emb_load_all = False  # forwarded to the embeddings loader
    freeze_word_embeddings = False  # forwarded to the tagger
    rnn_hidden_dim = 200  # forwarded to the tagger
    ## Character CNN config (all forwarded to the tagger)
    word_len = 20
    char_embeddings_dim = 100
    freeze_char_embeddings = False
    char_window_size = [4,3,2]
    # One filter per window size listed above.
    char_cnn_filter_num =len(char_window_size)
    
    dropout_ratio = 0.5  # forwarded to the tagger
    dataset_sort = False
    word_seq_indexer = None  # optional path: main() saves the word indexer there if no such file exists
    evaluator = 'f1-macro'  # evaluator name resolved by EvaluatorFactory.create
    opt = 'adam'  # optimizer name for OptimizerFactory.create: 'sgd' or 'adam'
    lr = 0.001 # in paper
    lr_decay = 0.95  # LambdaLR factor: lr is scaled by lr_decay ** epoch; main() steps the scheduler only if > 0
    weight_decay = 5e-4  # used only by the 'adam' optimizer
    momentum = 0.95  # used only by the 'sgd' optimizer
    patience = 4 # in paper
    # Report and checkpoint file names are timestamped; the timestamp is computed separately for each.
    report_fn = '%s_report.txt' % get_datetime_str()
    clip_grad = 5  # max gradient norm passed to nn.utils.clip_grad_norm_
    save = '%s_tagger.hdf5' % get_datetime_str()
    save_best = True  # save on every dev-score improvement instead of once after training
    
# Module-level configuration instance shared by the rest of the notebook.
args = ARGS()
print('> CONFIG DONE')

> CONFIG DONE


In [31]:
def set_seed(seed = args.seed):
    """Seed the random number generators used by the notebook so runs are repeatable.

    Seeds NumPy, Python's `random` module and PyTorch (default and CUDA generators), puts
    cuDNN into deterministic mode with benchmarking disabled, and stores the seed in the
    PYTHONHASHSEED environment variable.

    Args:
        seed: seed value; defaults to `args.seed` as it was when this function was defined.
    """
    # Same seed for every generator, applied in a fixed order.
    for seed_generator in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_generator(seed)

    # cuDNN needs both switches to behave deterministically.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

    # Pin the hash seed through the environment as well.
    os.environ['PYTHONHASHSEED'] = str(seed)
    print('> SEEDING DONE')
set_seed()

> SEEDING DONE


In [32]:
def main():
    """Train and evaluate the tagger configured by the module-level `args`.

    Pipeline:
      1. Select the CUDA device (when `args.gpu >= 0`) and read the train/dev/test sequences.
      2. Build the word and tag indexers, then create or load the tagger.
      3. For each epoch, train on all batches (epoch 0 is evaluation only), evaluate on
         train/dev/test, log the scores, and apply early stopping on the dev score.
      4. Save the tagger (on each dev improvement when `args.save_best`, otherwise once
         at the end) and write the final report.

    Reads everything from the global `args`; returns nothing.
    """
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    # Load text data as lists of lists of words (sequences) and corresponding list of lists of tags
    data_io = DataIOFactory.create(args)
    (word_sequences_train, tag_sequences_train,
     word_sequences_dev, tag_sequences_dev,
     word_sequences_test, tag_sequences_test) = data_io.read_train_dev_test(args)

    ## Dataset
    datasets_bank = DatasetsBankFactory.create(args)
    datasets_bank.add_train_sequences(word_sequences_train, tag_sequences_train)
    datasets_bank.add_dev_sequences(word_sequences_dev, tag_sequences_dev)
    datasets_bank.add_test_sequences(word_sequences_test, tag_sequences_test)

    # Word_seq_indexer converts lists of lists of words to lists of lists of integer indices and back
    word_seq_indexer = SeqIndexerWord(gpu=args.gpu, check_for_lowercase=args.check_for_lowercase,
                                      embeddings_dim=args.emb_dim, verbose=True)
    word_seq_indexer.load_items_from_embeddings_file_and_unique_words_list(
        emb_fn=args.emb_fn,
        emb_delimiter=args.emb_delimiter,
        emb_load_all=args.emb_load_all,
        unique_words_list=datasets_bank.unique_words_list)

    # Persist the word indexer only when a path is configured and nothing exists there yet.
    # `os.path.isfile` is used explicitly: only `os` is imported at the top of the file.
    if args.word_seq_indexer is not None and not os.path.isfile(args.word_seq_indexer):
        torch.save(word_seq_indexer, args.word_seq_indexer)

    # Tag_seq_indexer converts lists of lists of tags to lists of lists of integer indices and back
    tag_seq_indexer = SeqIndexerTag(gpu=args.gpu)
    tag_seq_indexer.load_items_from_tag_sequences(tag_sequences_train)

    # Create or load pre-trained tagger
    if args.load is None:
        tagger = TaggerFactory.create(args, word_seq_indexer, tag_seq_indexer, tag_sequences_train)
    else:
        tagger = TaggerFactory.load(args.load, args.gpu)

    evaluator = EvaluatorFactory.create(args)
    optimizer, scheduler = OptimizerFactory.create(args, tagger)

    # Prepare report and temporary variables for "save best" strategy
    report = Report(args.report_fn, args, score_names=('train loss', '%s-train' % args.evaluator,
                                                       '%s-dev' % args.evaluator, '%s-test' % args.evaluator))

    # Number of full batches per epoch; only used for progress display and loss normalisation.
    # Clamped to 1 so a training set smaller than one batch cannot cause a division by zero.
    iterations_num = max(1, floor(datasets_bank.train_data_num / args.batch_size))
    best_dev_score = -1
    best_epoch = -1
    best_test_score = -1
    best_test_msg = 'N/A'
    patience_counter = 0
    print('\nStart training...\n')
    for epoch in range(0, args.epoch_num + 1):
        time_start = time.time()
        loss_sum = 0
        # Epoch 0 performs no training: it only records the scores of the initial tagger.
        if epoch > 0:
            tagger.train()
            for i, (word_sequences_train_batch, tag_sequences_train_batch) in \
                    enumerate(datasets_bank.get_train_batches(args.batch_size)):
                tagger.train()
                tagger.zero_grad()
                loss = tagger.get_loss(word_sequences_train_batch, tag_sequences_train_batch)
                loss.backward()
                nn.utils.clip_grad_norm_(tagger.parameters(), args.clip_grad)
                optimizer.step()
                loss_sum += loss.item()
                print('\r-- train epoch %d/%d, batch %d/%d (%1.2f%%), loss = %1.2f.' %
                      (epoch, args.epoch_num, i + 1, iterations_num,
                       ceil(i*100.0/iterations_num), loss_sum*100 / iterations_num),
                      end='', flush=True)
            # Decay the learning rate once the epoch's optimizer steps are done. Stepping the
            # scheduler before optimizer.step() would skip the first value of the schedule.
            if args.lr_decay > 0:
                scheduler.step()

        # Evaluate tagger
        train_score, dev_score, test_score, test_msg = evaluator.get_evaluation_score_train_dev_test(
            tagger, datasets_bank, batch_size=100)
        print('\n== eval epoch %d/%d "%s" train / dev / test | %1.2f / %1.2f / %1.2f.' %
              (epoch, args.epoch_num, args.evaluator, train_score, dev_score, test_score))
        report.write_epoch_scores(epoch, (loss_sum*100 / iterations_num, train_score, dev_score, test_score))

        # Early stopping: track the best dev score and count epochs without improvement
        if dev_score > best_dev_score:
            best_dev_score = dev_score
            best_test_score = test_score
            best_epoch = epoch
            best_test_msg = test_msg
            patience_counter = 0
            if args.save is not None and args.save_best:
                tagger.save_tagger(args.save)
            print('## [BEST epoch], %d seconds.\n' % (time.time() - time_start))
        else:
            patience_counter += 1
            print('## [no improvement micro-f1 on DEV during the last %d epochs (best_f1_dev=%1.2f), %d seconds].\n' %
                  (patience_counter, best_dev_score, (time.time()-time_start)))
        if patience_counter > args.patience and epoch > args.min_epoch_num:
            break

    # Save final trained tagger to disk, if it is not already saved according to "save best"
    if args.save is not None and not args.save_best:
        tagger.save_tagger(args.save)

    # Show and save the final scores
    if args.save_best:
        report.write_final_score('Final eval on test, "save best", best epoch on dev %d, %s, test = %1.2f)' %
                                 (best_epoch, args.evaluator, best_test_score))
        report.write_msg(best_test_msg)
        report.write_input_arguments()
        report.write_final_line_score(best_test_score)
    else:
        report.write_final_score('Final eval on test, %s test = %1.2f)' % (args.evaluator, test_score))
        report.write_msg(test_msg)
        report.write_input_arguments()
        report.write_final_line_score(test_score)
    if args.verbose:
        report.make_print()

In [33]:
# Entry point: run the training/evaluation pipeline when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Downloading builder script:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading and preparing dataset ncbi_disease/ncbi_disease (download: 1.47 MiB, generated: 3.04 MiB, post-processed: Unknown size, total: 4.52 MiB) to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/284k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5433 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/924 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/941 [00:00<?, ? examples/s]

Dataset ncbi_disease downloaded and prepared to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Loading from train: 5432 samples, 136086 words.
Loading from validation: 923 samples, 23969 words.
Loading from test: 940 samples, 24497 words.
DatasetsBank: len(unique_words_list) = 9284 unique words.
DatasetsBank: len(unique_words_list) = 10056 unique words.
DatasetsBank: len(unique_words_list) = 10818 unique words.
Reading embeddings file /kaggle/input/glove6b/glove.6B.200d.txt, line = 0
Reading embeddings file /kaggle/input/glove6b/glove.6B.200d.txt, line = 100000
Reading embeddings file /kaggle/input/glove6b/glove.6B.200d.txt, line = 200000
Reading embeddings file /kaggle/input/glove6b/glove.6B.200d.txt, line = 300000

load_vocabulary_from_embeddings_file_and_unique_words_list:
    First 50 OOV words:
        out_of_vocabulary_words_list[0] = APC2
        out_of_vocabulary_words_list[1] = 3beta
        out_of_vocabulary_words_list[2] = axin
        out_of_vocabulary_words_list[3] = conductin
        out_of_vocabulary_words_list[4] = betacatenin
        out_of_vocabulary_words_list