In [279]:
import warnings
import random
from typing import Dict, Union, Any, List, Tuple
import numpy as np
from numpy.core.multiarray import ndarray
from bpemb import BPEmb
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [205]:
# Tag name -> integer ID. The position of a name in the tuple below is its ID.
tags = {name: idx for idx, name in enumerate(("POI", "StreetName", "Others", "EOS"))}

## Tags Converter

`TagsConverter` is a utility class for converting between a tag name and its integer ID, in both directions.

In [13]:
# https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/converter/target_converter.py

class TagsConverter:
    """
    Two-way lookup between tags and their indexes.
    Args:
        tags_to_idx (Dict): A dictionary where the keys are the tags (e.g. StreetNumber) and the values are
            the indexes (int) (e.g. 1).
    """

    def __init__(self, tags_to_idx: Dict) -> None:
        self.tags_to_idx = tags_to_idx
        # Reverse table (index -> tag), built once from the mapping given at construction time.
        self.idx_to_tags = {idx: tag for tag, idx in tags_to_idx.items()}

    def __call__(self, key: Union[str, int]) -> Union[int, str]:
        """
        Convert a tag to its index, or an index to its tag, using the conversion tables.
        Args:
            key (Union[str, int]): A tag (``str``) or an index.
        Return:
            The index of the tag when ``key`` is a ``str``; otherwise the tag stored under the index ``key``.
        Raises:
            KeyError: If ``key`` is not present in the corresponding conversion table.
        """
        if isinstance(key, str):
            return self.tags_to_idx[key]
        return self.idx_to_tags[key]

In [17]:
tags_ids = TagsConverter(tags)

In [23]:
tags_ids('POI'), tags_ids(0)

(0, 'POI')

## Token -> Subword Embeddings

`BPEmb` converts a string into subword units and their pre-trained embeddings. The model loaded here has a vocabulary of $10^5$ subwords, and each subword has an embedding of dimension 300.

In [38]:
emb_model = BPEmb(lang="multi", vs=100000, dim=300)

In [356]:
emb_model.encode("Hello ave fast")

['▁h', 'ello', '▁ave', '▁fast']

In [40]:
emb_model.encode_ids("hello")

[35, 3333]

In [52]:
emb_model.embed("hello").shape

(2, 300)

## Vectorizer

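`BPEmbVectorizer` is a class for converting a list of addresses into nested lists of subword embeddings: for each address, one list of subword vectors per word, together with the number of subwords of each word.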

In [55]:
# https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/vectorizer/bpemb_vectorizer.py#L9

class BPEmbVectorizer:
    """
    BPEmb vectorizer to convert an address into BPEmb embeddings, where each word is decomposed into subword units
    that are in turn embedded as vectors.
    Args:
        embeddings_model (Any): The embeddings model. The methods below call ``embeddings_model.embed(word)`` and
            read ``embeddings_model.dim`` (the size of the padding vectors).
    """

    def __init__(self, embeddings_model: Any) -> None:
        self.embeddings_model = embeddings_model
        self.padding_value = 0

    def __call__(self, addresses: List[str]) -> List[Tuple]:
        """
        Method to vectorize addresses.
        Args:
            addresses (list[str]): The addresses to vectorize.
        Return:
            A list with one tuple per address: the embedding vectors of the address elements (one list of subword
            vectors per word, padded so every word of the batch has the same number of vectors) and the word
            decomposition lengths (the number of subwords of each word before padding).
        """
        # Longest word decomposition seen in the current batch; updated by ``_vectorize_sequence``.
        self._max_length = 0
        batch = [self._vectorize_sequence(address) for address in addresses]
        self._decomposed_sequence_padding(batch)
        return batch

    def _vectorize_sequence(self, address: str) -> Tuple[List, List]:
        """
        Method to vectorize the address.
        Args:
            address (str): Address to vectorize using BPEmb.
        Return:
            A tuple of list of word vectors and the word decomposition lengths. Both lists are empty when the
            address contains no word.
        """
        input_sequence = []
        word_decomposition_lengths = []
        address = address.replace(",", "")  # see issue 56 https://github.com/GRAAL-Research/deepparse/issues/56
        for word in address.split():
            bpe_decomposition = self.embeddings_model.embed(word)
            word_decomposition_lengths.append(len(bpe_decomposition))
            input_sequence.append(list(bpe_decomposition))

        # ``default=0`` keeps an address without any word (e.g. "" or ",") from raising ``ValueError``.
        self._max_length = max(self._max_length, max(word_decomposition_lengths, default=0))

        return input_sequence, word_decomposition_lengths

    def _decomposed_sequence_padding(self, batch: List[Tuple]) -> None:
        """
        Method to add padding to the decomposed sequences, in place, so that every word of the batch has
        ``self._max_length`` subword vectors.
        """
        for decomposed_sequence, _ in batch:
            for decomposition in decomposed_sequence:
                missing = self._max_length - len(decomposition)
                if missing > 0:
                    # float64 vector filled with the padding value (same dtype as ``np.ones(dim) * [value]``).
                    padding_vector = np.full(self.embeddings_model.dim, self.padding_value, dtype=np.float64)
                    decomposition.extend([padding_vector] * missing)

In [56]:
vectorizer = BPEmbVectorizer(embeddings_model=emb_model)

In [231]:
output = vectorizer(["Hello ave fast"])
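# output[0] => "Hello ave fast" (first address of the batch)
# output[0][0] => subword embeddings, one list per word
#     output[0][0][0] => Hello
#     output[0][0][1] => ave
#     output[0][0][2] => fast
# output[0][1] => number of subwords of each word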

In [232]:
type(output)

list

In [234]:
output[0][1]

[2, 1, 1]

## Padding to torch Tensor

Note that different addresses may have a different number of words. To handle this, `bpemb_data_padding` pads the shorter addresses with `padding_value` (`-100` by default) and converts the nested lists of subword embeddings into a padded `torch.Tensor`. It returns that tensor together with the padded word decomposition lengths and the original (unpadded) length of each address.

In [236]:
# https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/converter/data_padding.py#L36

def bpemb_data_padding(batch: List[Tuple], padding_value=-100) -> Tuple:
    """
    Function that add padding to the sequences and to the decomposition lengths so all can have the same length as
    the longest one. The batch is reordered from the longest to the shortest sequence; the input lists are left
    untouched.
    Args:
        batch (list[tuple]): The list of vectorize tupled batch data where the first element is the address embeddings
            and the second is the word decomposition lengths.
        padding_value: The value used to fill the padded positions of the word vectors tensor (default ``-100``).
    Returns:
        A tuple (``x``, ``y``, ``z``). The element ``x`` is a tensor of padded word vectors, ``y`` is the padded
        decomposition lengths, and ``z`` is the original lengths of the sequences before padding.
    Raises:
        ValueError: If ``batch`` is empty.
    """
    # Longest sequence first (the sort is stable, so ties keep their input order).
    ordered_batch = sorted(batch, key=lambda pair: len(pair[0]), reverse=True)
    if not ordered_batch:
        raise ValueError("Cannot pad an empty batch.")

    sequences_vectors = [torch.tensor(vectors) for vectors, _ in ordered_batch]
    lengths = torch.tensor([len(vectors) for vectors, _ in ordered_batch])

    padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value)

    # Pad the decomposition lengths with 1 for the padded word positions. New lists are built so the lists owned
    # by the caller are not modified.
    max_sequence_length = lengths.max().item()
    padded_decomposition_lengths = [
        list(word_decomposition_len) + [1] * (max_sequence_length - len(word_decomposition_len))
        for _, word_decomposition_len in ordered_batch
    ]

    return padded_sequences_vectors, padded_decomposition_lengths, lengths

In [237]:
padded_output = bpemb_data_padding(output)

In [238]:
padded_output

(tensor([[[[ 0.1198, -0.0876, -0.3663,  ..., -0.1264,  0.0360,  0.3640],
           [ 0.3029, -0.0928, -0.3175,  ...,  0.5222, -0.1151,  0.2372]],
 
          [[ 0.2716, -0.3184,  0.4688,  ...,  0.5481,  0.2733, -0.5135],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
          [[-0.0346, -0.1021, -0.7138,  ..., -0.5223, -0.0465, -0.0476],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]],
        dtype=torch.float64),
 [[2, 1, 1]],
 tensor([3]))

## Seq2Seq Model

Here is a simple model modified from deepparse. It takes a sequence of subword embeddings, merges the subwords of each word into a single word vector, and outputs tag log-probabilities for each decoding step (one step per word, plus one).

For example, the string `"Hello ave fast"` has 3 words, which BPEmb splits into 4 subwords `['▁h', 'ello', '▁ave', '▁fast']`. The model outputs a tensor of shape `(n_words + 1, batch_size, 4)` holding, for each decoding step, the log-probability (log-softmax output) of each tag. Currently, we have 4 tags, as follows: POI, Street Name, Others, End-of-String.


In [357]:
# adapted from https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/network/bpemb_seq2seq.py#L9

class Seq2SeqModel(nn.Module):
    """
    Sequence-to-sequence tagger working on subword embeddings.

    The model has three stages:
        1. a bidirectional LSTM followed by a linear projection that turns the subword vectors of each word into a
           single word vector;
        2. an LSTM encoder that reads the word vectors of each address;
        3. an LSTM decoder, initialised with the final encoder state, that emits one vector of tag
           log-probabilities (``LogSoftmax`` output) per decoding step.

    All sizes are fixed in ``__init__`` (300-dimensional inputs, 1024 hidden units, 4 output tags).
    """

    def __init__(self):
        super().__init__()

        # Embedding network
        self.embedding_input_size = 300
        self.embedding_hidden_size = 300
        self.embedding_num_layers = 1
        self.embedding = nn.LSTM(self.embedding_input_size,
                                 self.embedding_hidden_size,
                                 num_layers=self.embedding_num_layers,
                                 batch_first=True,
                                 bidirectional=True)

        self.embedding_projection_size = 300
        # The attribute name (with its original spelling) is kept so that ``state_dict`` keys do not change.
        self.embdding_projection = nn.Linear(2 * self.embedding_hidden_size, self.embedding_projection_size)

        # Encoder
        self.encoder_input_size = 300
        self.encoder_hidden_size = 1024
        self.encoder_num_layers = 1
        self.encoder = nn.LSTM(self.encoder_input_size,
                               self.encoder_hidden_size,
                               num_layers=self.encoder_num_layers,
                               batch_first=True)

        # Decoder
        self.decoder_input_size = 1
        self.decoder_hidden_size = 1024
        self.decoder_num_layers = 1
        # The decoder is fed one step at a time with inputs of shape (1, batch, 1) and the encoder state of shape
        # (num_layers, batch, hidden). That is the sequence-first layout, so ``batch_first`` must stay False here;
        # with ``batch_first=True`` the batch axis was read as the time axis and any batch larger than one failed
        # on the hidden-state size check.
        self.decoder = nn.LSTM(self.decoder_input_size,
                               self.decoder_hidden_size,
                               num_layers=self.decoder_num_layers)

        self.decoder_projection_output_size = 4
        self.decoder_projection = nn.Sequential(
            nn.Linear(self.decoder_hidden_size, self.decoder_projection_output_size),
            nn.LogSoftmax(dim=1),
        )

    def forward(self,
                to_predict: torch.Tensor,
                decomposition_lengths: List,
                lengths_tensor: torch.Tensor,
                target: Union[torch.Tensor, None] = None) -> torch.Tensor:
        """
        Predict the tag log-probabilities of a batch of vectorized addresses.
        Args:
            to_predict (torch.Tensor): Padded subword embeddings, indexed as (batch, word, subword, embedding).
            decomposition_lengths (List): For each address, the number of subwords of each (padded) word.
            lengths_tensor (torch.Tensor): The number of words of each address before padding.
            target (Union[torch.Tensor, None]): Optional tag indexes, indexed as (batch, step). When given, they are
                fed to the decoder instead of its own predictions with probability 0.5 (teacher forcing).
                NOTE(review): this happens whenever ``target`` is passed, whatever ``self.training`` is.
        Return:
            A tensor of shape (longest address length + 1, batch, number of tags) of log-probabilities.
        """
        device = to_predict.device
        batch_size = to_predict.size(0)

        #### Get embedded output
        embeddings = self._embed_words(to_predict, decomposition_lengths)

        #### Encoder
        packed_sequence = pack_padded_sequence(embeddings, lengths_tensor.cpu(), batch_first=True, enforce_sorted=False)
        _, decoder_hidden = self.encoder(packed_sequence)

        #### Decoder
        # Use the real maximum instead of the first entry, so the batch does not have to be sorted by length.
        max_length = int(lengths_tensor.max().item())
        prediction_sequence = torch.zeros(max_length + 1, batch_size, self.decoder_projection_output_size,
                                          device=device)

        # The first decoder input is a constant -1 for every element of the batch.
        decoder_input = torch.full((1, batch_size, 1), -1.0, device=device)
        decoder_output, decoder_hidden = self._decode_step(decoder_input, decoder_hidden)
        prediction_sequence[0] = decoder_output

        if target is not None and random.random() < 0.5:
            # Teacher forcing: the next input is the ground-truth tag.
            target = target.transpose(0, 1)
            for idx in range(max_length):
                decoder_input = target[idx].reshape(1, batch_size, 1)
                decoder_output, decoder_hidden = self._decode_step(decoder_input, decoder_hidden)
                prediction_sequence[idx + 1] = decoder_output
        else:
            # Greedy decoding: the next input is the most likely tag of the previous step.
            for idx in range(max_length):
                _, decoder_input = decoder_output.topk(1)
                decoder_output, decoder_hidden = self._decode_step(decoder_input.view(1, batch_size, 1),
                                                                   decoder_hidden)
                prediction_sequence[idx + 1] = decoder_output

        return prediction_sequence

    def _embed_words(self, to_predict: torch.Tensor, decomposition_lengths: List) -> torch.Tensor:
        """
        Merge the subword vectors of every word into one word vector; the result is indexed as (batch, word, feature).
        """
        device = to_predict.device
        batch_index = torch.arange(to_predict.size(0), device=device)
        # (batch, word, subword, embedding) -> (word, batch, subword, embedding)
        words_first = to_predict.transpose(0, 1).float()

        word_vectors = []
        for word_idx in range(words_first.size(0)):
            # Number of subwords of the ``word_idx``-th word of every address of the batch.
            subword_counts = torch.tensor([lengths[word_idx] for lengths in decomposition_lengths])

            packed_sequence = pack_padded_sequence(words_first[word_idx], subword_counts,
                                                   batch_first=True, enforce_sorted=False)
            packed_output, _ = self.embedding(packed_sequence)
            padded_output, padded_output_lengths = pad_packed_sequence(packed_output, batch_first=True)

            # Keep the LSTM output at the last real (non-padded) subword of each word.
            last_step = (padded_output_lengths - 1).to(device)
            word_context = padded_output[batch_index, last_step]

            word_vectors.append(self.embdding_projection(word_context))

        return torch.stack(word_vectors, dim=1)

    def _decode_step(self, decoder_input: torch.Tensor, decoder_hidden: Tuple) -> Tuple:
        """
        Run one decoder step and return the (batch, number of tags) log-probabilities and the new hidden state.
        """
        # ``.float()`` is required because the inputs are tag indexes (integer tensors) after the first step.
        decoder_output, decoder_hidden = self.decoder(decoder_input.float(), decoder_hidden)
        return self.decoder_projection(decoder_output[0]), decoder_hidden

In [362]:
emb_model = BPEmb(lang="multi", vs=100000, dim=300)
vectorizer = BPEmbVectorizer(embeddings_model=emb_model)
model = Seq2SeqModel()

In [363]:
output = vectorizer(["Hello ave fast"])
padded_output = bpemb_data_padding(output)

In [364]:
predictions = model(*padded_output)

In [365]:
# Most likely tag and its log-probability at every decoding step, taken from a single ``max`` over the tag axis.
best_log_probs, best_tags = predictions.max(2)
tags_predictions = best_tags.transpose(0, 1).cpu().numpy()
tags_predictions_prob = torch.exp(best_log_probs).transpose(0, 1).detach().cpu().numpy()

In [366]:
tags_predictions_prob

array([[0.2530992 , 0.2529205 , 0.25370014, 0.25450543]], dtype=float32)