In [3]:
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

**Prepare data**

Words are modeled with **a character-based RNN with LSTM**, which produces two vectors.

The forward vector will have a representation of the character sequence from the left to the right.

The backward one will have the same in the reversed order.

Our insight is that this character-based LSTM captures the phonological structure of the word from its graphemes/characters.

These two vectors are concatenated together with the whole **word’s embedding**
(the embeddings could be pre-trained from larger corpora or trained jointly for the task).

The vector of these three elements will represent each word in the sequence. Then, for each word, there will be a **word-level LSTM**, which will produce an output for each word, with its right and left context information.

Finally, this output will go through a **CRF layer** to get the optimal output.


<img src="image_ref/BiRNN_before_CRF.png">

BiRNN-CRF structure, concatenated with pre-trained word embeddings

**Util functions**

In [4]:
def init_embedding(input_embedding):
    """
    Fill an embedding weight tensor in place with uniform random values.

    The values are drawn from U(-r, r) with r = sqrt(3 / input_embedding.size(1)).
    Nothing is returned; the tensor passed in is overwritten.
    """
    limit = np.sqrt(3.0 / input_embedding.size(1))
    # torch.nn.init.uniform_(tensor, a=0, b=1) fills `tensor` with values
    # drawn from the uniform distribution U(a, b).
    nn.init.uniform_(input_embedding, a=-limit, b=limit)
    
def init_lstm(input_lstm):
    """
    Initialize the parameters of an ``nn.LSTM`` in place.

    Weights: every ``weight_ih`` / ``weight_hh`` matrix (all layers, plus the
    ``_reverse`` ones when the LSTM is bidirectional) is filled with values
    drawn from U(-r, r), where r = sqrt(6 / (rows / 4 + cols)).  PyTorch stacks
    the four gate matrices along dim 0, hence the division by 4.

    Biases (only when the LSTM was built with ``bias=True``): every ``bias_ih``
    / ``bias_hh`` vector is zeroed and then its slice
    ``[hidden_size : 2 * hidden_size]`` is set to 1.  In PyTorch's gate layout
    that slice is the forget gate; since both bias vectors receive the ones,
    their sum for that slice is 2.

    Args:
        input_lstm: the ``nn.LSTM`` module to initialize.  Nothing is returned.
    """
    # Forward-direction parameters are handled before the reverse ones, and
    # 'ih' before 'hh' within a layer, so random numbers are consumed in a
    # fixed, reproducible order.
    suffixes = ['', '_reverse'] if input_lstm.bidirectional else ['']
    layers = range(input_lstm.num_layers)
    hidden_size = input_lstm.hidden_size

    for suffix in suffixes:
        for layer in layers:
            for kind in ('ih', 'hh'):
                # getattr instead of eval: plain attribute lookup by name.
                weight = getattr(input_lstm, 'weight_%s_l%d%s' % (kind, layer, suffix))
                bound = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
                # uniform_ is the in-place initializer; the un-suffixed
                # nn.init.uniform is deprecated.
                nn.init.uniform_(weight, -bound, bound)

    if input_lstm.bias:
        for suffix in suffixes:
            for layer in layers:
                for kind in ('ih', 'hh'):
                    bias = getattr(input_lstm, 'bias_%s_l%d%s' % (kind, layer, suffix))
                    bias.data.zero_()
                    bias.data[hidden_size: 2 * hidden_size] = 1


**Build the Model**

- for character embedding, initialize lookup table
- train them by BiRNN with LSTM or CNN

- and concatenate character-level representation with pre-trained word-level representation
- pre-trained word lookup table <= initialized using a word2vec-like method.
- UNK embedding if not in lookup-table
- UNK embedding is trained with singletons with probability 0.5

- add dropout after the concatenation

- done building our embeddings

- use new LSTM-CRF model to generate the target sequences


In [6]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tage_to_ix, embedding_dim, hidden_dim, char_lstm_dim=25,
                      char_to_ix=None, pre_word_embeds=None, char_embedding_dim=25, use_gpu=False,
                      n_cap=None, cap_embedding_dim=None, use_crf=True, char_mode='CNN'):
        """
        Build the embedding layers, the word-level BiLSTM and the CRF transitions.

        Args:
            vocab_size: number of rows of the word embedding table.
            tage_to_ix: tag -> index mapping; its length is the tagset size and
                it is indexed with START_TAG / STOP_TAG.  The misspelled
                parameter name is kept so existing callers keep working.
            embedding_dim: size of each word embedding vector.
            hidden_dim: hidden size (per direction) of the word-level LSTM.
            char_lstm_dim: hidden size of the character LSTM; also used as the
                number of output channels of the character CNN.
            char_to_ix: character -> index mapping; its length sizes the
                character embedding table.  Required unless
                ``char_embedding_dim`` is None.
            pre_word_embeds: optional pre-trained word vectors; when given they
                replace the word embedding weights.
            char_embedding_dim: size of each character embedding; None skips
                building the character-level layers.
            use_gpu: stored on the instance; not used by this method.
            n_cap, cap_embedding_dim: when both are truthy, an extra embedding
                table of shape (n_cap, cap_embedding_dim) is created
                (presumably capitalization features -- confirm with the caller).
            use_crf: when true, the CRF transition matrix is created.
            char_mode: 'CNN' or 'LSTM', the character-level encoder to build.

        Raises:
            ValueError: if ``char_mode`` is not 'CNN' or 'LSTM', or if
                ``char_to_ix`` is missing while ``char_embedding_dim`` is set.
        """
        super(BiLSTM_CRF, self).__init__()
        # The signature spells the parameter `tage_to_ix`; bind the intended
        # name locally so the body no longer depends on a global `tag_to_ix`.
        tag_to_ix = tage_to_ix

        self.use_gpu = use_gpu
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.n_cap = n_cap
        self.cap_embedding_dim = cap_embedding_dim
        self.use_crf = use_crf
        self.tagset_size = len(tag_to_ix)
        self.out_channels = char_lstm_dim  # number of output channels
        self.char_mode = char_mode
        # char_mode: CNN or LSTM
        # char_lstm_dim: number of output channels
        # hidden_dim: number of hidden dimensions
        print('char_mode: %s, out_channels: %d, hidden_dim: %d, ' % (char_mode, char_lstm_dim, hidden_dim))

        # Any other value would leave self.lstm undefined further down.
        if char_mode not in ('LSTM', 'CNN'):
            raise ValueError("char_mode must be 'LSTM' or 'CNN', got %r" % (char_mode,))

        use_cap = bool(self.n_cap and self.cap_embedding_dim)
        if use_cap:
            # nn.Embedding(num_embeddings, embedding_dim): lookup table with
            # num_embeddings rows of size embedding_dim.
            self.cap_embeds = nn.Embedding(self.n_cap, self.cap_embedding_dim)
            init_embedding(self.cap_embeds.weight)

        if char_embedding_dim is not None:
            if char_to_ix is None:
                raise ValueError('char_to_ix is required when char_embedding_dim is not None')
            self.char_lstm_dim = char_lstm_dim
            self.char_embeds = nn.Embedding(len(char_to_ix), char_embedding_dim)
            init_embedding(self.char_embeds.weight)
            if self.char_mode == 'LSTM':
                self.char_lstm = nn.LSTM(char_embedding_dim, char_lstm_dim, num_layers=1, bidirectional=True)
                init_lstm(self.char_lstm)
            else:  # 'CNN'
                self.char_cnn3 = nn.Conv2d(in_channels=1, out_channels=self.out_channels,
                                           kernel_size=(3, char_embedding_dim), padding=(2, 0))

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        if pre_word_embeds is not None:
            self.pre_word_embeds = True
            # nn.Parameter(data) registers the tensor as a trainable module
            # parameter (requires_grad defaults to True).
            self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
        else:
            self.pre_word_embeds = False

        # nn.Dropout(p=0.5): each element is zeroed with probability p.
        self.dropout = nn.Dropout(0.5)

        # Everything above builds the word representation (character-level
        # encoder + word embedding [+ cap embedding]).
        ######################################################
        # Everything below is the BiLSTM-CRF that consumes that representation.

        # A bidirectional char LSTM contributes 2 * char_lstm_dim features,
        # the char CNN contributes out_channels features.
        if self.char_mode == 'LSTM':
            char_repr_dim = char_lstm_dim * 2
        else:
            char_repr_dim = self.out_channels
        lstm_input_dim = embedding_dim + char_repr_dim
        if use_cap:
            lstm_input_dim += cap_embedding_dim
        self.lstm = nn.LSTM(lstm_input_dim, hidden_dim, bidirectional=True)
        init_lstm(self.lstm)

        self.hw_trans = nn.Linear(self.out_channels, self.out_channels)
        self.hw_gate = nn.Linear(self.out_channels, self.out_channels)
        self.h2_h1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.tanh = nn.Tanh()
        self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)
        init_linear(self.h2_h1)
        init_linear(self.hidden2tag)
        init_linear(self.hw_gate)
        init_linear(self.hw_trans)

        if self.use_crf:
            # Square (tagset_size x tagset_size) table of transition scores.
            self.transitions = nn.Parameter(
                torch.zeros(self.tagset_size, self.tagset_size))
            # -10000 acts as "never": presumably forbids transitions into
            # START_TAG and out of STOP_TAG -- confirm against the scoring code.
            self.transitions.data[tag_to_ix[START_TAG], :] = -10000
            self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
            