In [1]:
import warnings
import random
from ast import literal_eval
from typing import Dict, Union, Any, List, Tuple
import pandas as pd
import numpy as np
from numpy.core.multiarray import ndarray
from tqdm.notebook import tqdm
from bpemb import BPEmb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.tensorboard import SummaryWriter

In [2]:
TRAIN_CSV_PATH = './santi/deepparse_clean_only_train_split.csv'

In [3]:
tags = {
    "PointOfInterest": 0,
    "Street": 1,
    "Other": 2,
    "EOS": 3,
}

## Tags Coverter

`TagsCoverter` is a utility class for converting between tag ID and tag name.

In [4]:
# https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/converter/target_converter.py

class TagsConverter:
    """
    Class to define logic of tag to idx conversion and vice versa.
    Args:
        tags_to_idx (Dict): A dictionary where the keys are the tags (e.g. StreetNumber) and the values are
            the indexes (int) (e.g. 1).
    """

    def __init__(self, tags_to_idx: Dict) -> None:
        self.tags_to_idx = tags_to_idx
        self.idx_to_tags = {v: k for k, v in tags_to_idx.items()}

    def __call__(self, key: Union[str, int]) -> int:
        """
        If str convert from a tag to idx and if int convert from a idx to a tag using the convert table.
        """
        if isinstance(key, str):
            return self.tags_to_idx[key]
        return self.idx_to_tags[key]

In [5]:
tags_converter = TagsConverter(tags)

In [6]:
tags_converter('PointOfInterest'), tags_converter(0)

(0, 'PointOfInterest')

## Token -> Subword Embeddings

`BPEmb` is a way to convert between string to subword embeddings. In this model, we have $10^5$ subwords and each subword has an embedding of dimension 300.

In [7]:
emb_model = BPEmb(lang="multi", vs=100000, dim=300)

In [8]:
emb_model.encode("Hello ave fast")

['▁h', 'ello', '▁ave', '▁fast']

In [9]:
emb_model.encode_ids("hello")

[35, 3333]

In [10]:
emb_model.embed("hello").shape

(2, 300)

## Vectorizer

`BPEmbVectorizer` is a class for converting a list of addresses into nested array subword embeddings.

In [11]:
# https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/vectorizer/bpemb_vectorizer.py#L9

class BPEmbVectorizer:
    """
    BPEmb vectorizer to convert an address into BPEmb embedding where each word is decomposed into subword units that
    are in turn embedded as a vector
    """

    def __init__(self, embeddings_model: Any) -> None:
        self.embeddings_model = embeddings_model
        self.padding_value = 0

    def __call__(self, addresses: List[str]) -> List[Tuple]:
        """
        Method to vectorizer addresses.
        Args:
            addresses (list[str]): The addresses to vectorize.
        Return:
            A tuple of the addresses elements (components) embedding vectosr and the word decomposition lengths.
        """
        self._max_length = 0
        batch = [self._vectorize_sequence(address) for address in addresses]
        self._decomposed_sequence_padding(batch)
        return batch

    def _vectorize_sequence(self, address: str) -> Tuple[List, List]:
        """
        Method to vectorize the address.
        Args:
            address (str): Address to vectorize using BPEmb.
        Return:
            A tuple of list of word vector and the word decomposition lengths.
        """
        input_sequence = []
        word_decomposition_lengths = []
#         address = address.replace(",", "")  # see issue 56 https://github.com/GRAAL-Research/deepparse/issues/56
        for word in address.split():
            bpe_decomposition = self.embeddings_model.embed(word)
            word_decomposition_lengths.append(len(bpe_decomposition))
            input_sequence.append(list(bpe_decomposition))

        self._max_length = max(self._max_length, max(word_decomposition_lengths))

        return input_sequence, word_decomposition_lengths

    def _decomposed_sequence_padding(self, batch: List[Tuple]) -> None:
        """
        Method to add padding to the decomposed sequence.
        """
        for decomposed_sequence, _ in batch:
            for decomposition in decomposed_sequence:
                if len(decomposition) != self._max_length:
                    decomposition.extend([np.ones(self.embeddings_model.dim) * [self.padding_value]] *
                                         (self._max_length - len(decomposition)))

In [12]:
vectorizer = BPEmbVectorizer(embeddings_model=emb_model)

In [13]:
output = vectorizer(["Hello ave fast"])
# output[0] => "Hello ave"
# output[0][0] => embeddings
#     output[0][0][0] => Hello
#     output[0][0][1] => Ave
# output[0][1] => length

In [14]:
type(output)

list

In [15]:
output[0][1]

[2, 1, 1]

## Padding to torch Tensor

Note that different addresses may have different number of subwords. To handle this, we pad the tensor with zeros. `bpemb_data_padding` handles the padding and converts the nested array of subwords to `torch.Tensor` of padded subword embedding and its length.

In [16]:
# https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/converter/data_padding.py#L36

def bpemb_data_padding(batch: List[Tuple], padding_value=-100) -> Tuple:
    """
    Function that add padding to the sequences and to the decomposition lengths so all can have the same length as
    the longest one.
    Args:
        batch (list[tuple]): The list of vectorize tupled batch data where the first element is the address embeddings
            and the second is the word decomposition lengths.
    Returns:
        A tuple (``x``, ``y``, ``z``). The element ``x`` is a tensor of padded word vectors, ``y`` is the padded
        decomposition lengths, and ``z`` is the original lengths of the sequences before padding.
    """

    sequences_vectors, decomp_len, lengths = zip(
        *[(torch.tensor(vectors), word_decomposition_len, len(vectors))
          for vectors, word_decomposition_len in sorted(batch, key=lambda x: len(x[0]), reverse=True)])

    lengths = torch.tensor(lengths)

    padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value)

    # pad decomposition length
    max_sequence_length = lengths.max().item()
    for decomposition_length in decomp_len:
        if len(decomposition_length) < max_sequence_length:
            decomposition_length.extend([1] * (max_sequence_length - len(decomposition_length)))

    return padded_sequences_vectors, list(decomp_len), lengths

In [17]:
padded_output = bpemb_data_padding(output)

In [18]:
padded_output

(tensor([[[[ 0.1198, -0.0876, -0.3663,  ..., -0.1264,  0.0360,  0.3640],
           [ 0.3029, -0.0928, -0.3175,  ...,  0.5222, -0.1151,  0.2372]],
 
          [[ 0.2716, -0.3184,  0.4688,  ...,  0.5481,  0.2733, -0.5135],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
          [[-0.0346, -0.1021, -0.7138,  ..., -0.5223, -0.0465, -0.0476],
           [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]],
        dtype=torch.float64),
 [[2, 1, 1]],
 tensor([3]))

## Seq2Seq Model

Here is a simple model modified from deepparse. It takes a sequence of subword embeddings and outputs tagging probability for each subword.

For example, the string `"Hello ave fast"` has 4 subwords `['▁h', 'ello', '▁ave', '▁fast']`. The model outputs a tensor of shape `(n_subwords, 4)` indicating the logits of the particular subword having certain tag. Currently, we have 4 tags, as follows: POI, Street Name, Others, End-of-String.


In [19]:
# adapted from https://github.com/GRAAL-Research/deepparse/blob/0951ffa18b0838fbd536d8d607695f1667d9939a/deepparse/network/bpemb_seq2seq.py#L9

class Seq2SeqModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Embedding network
        self.embedding_input_size = 300
        self.embedding_hidden_size = 300
        self.embedding_num_layers = 1
        self.embedding = nn.LSTM(self.embedding_input_size, 
                                 self.embedding_hidden_size, 
                                 num_layers=self.embedding_num_layers,
                                 batch_first=True,
                                 bidirectional=True)

        self.embedding_projection_size = 300
        self.embdding_projection = nn.Linear(2 * self.embedding_hidden_size, self.embedding_projection_size)
        
        # Encoder
        self.encoder_input_size = 300
        self.encoder_hidden_size = 1024
        self.encoder_num_layers = 1
        self.encoder = nn.LSTM(self.encoder_input_size, 
                               self.encoder_hidden_size,
                               num_layers=self.encoder_num_layers, 
                               batch_first=True)
        
        # Decoder
        self.decoder_input_size = 1
        self.decoder_hidden_size = 1024
        self.decoder_num_layers = 1
        self.decoder = nn.LSTM(self.decoder_input_size, 
                               self.decoder_hidden_size,
                               num_layers=self.decoder_num_layers)
        
        self.decoder_projection_output_size = 4
        self.decoder_projection = []
        self.decoder_projection.append(nn.Linear(self.decoder_hidden_size, self.decoder_projection_output_size))
        self.decoder_projection.append(nn.LogSoftmax(dim=1))
        self.decoder_projection = nn.Sequential(*self.decoder_projection)
        
    def forward(self, 
                to_predict: torch.Tensor, 
                decomposition_lengths: List, 
                lengths_tensor: torch.Tensor,
                target: Union[torch.Tensor, None] = None) -> torch.Tensor:
        device = to_predict.device
        batch_size = to_predict.size(0)
        
        #### Get embedded output
        embeddings = torch.zeros(to_predict.size(1), to_predict.size(0), to_predict.size(3)).to(device)
        to_predict = to_predict.transpose(0, 1).float()
        
        for i in range(to_predict.size(0)):
            lengths = []
            
            for decomposition_length in decomposition_lengths:
                lengths.append(decomposition_length[i])
            
            packed_sequence = pack_padded_sequence(to_predict[i], torch.tensor(lengths).cpu(), batch_first=True, enforce_sorted=False)
            packed_output, _ = self.embedding(packed_sequence)
            padded_output, padded_output_lengths = pad_packed_sequence(packed_output, batch_first=True)
            
            word_context = torch.zeros(padded_output.size(0), padded_output.size(2)).to(device)
            for j in range(batch_size):
                word_context[j] = padded_output[j, padded_output_lengths[j] - 1, :]
            
            projection_output = self.embdding_projection(word_context)
            
            embeddings[i] = projection_output
        
        embeddings = embeddings.transpose(0, 1)
                
        #### Encoder
        packed_sequence = pack_padded_sequence(embeddings, lengths_tensor.cpu(), batch_first=True, enforce_sorted=False)
        _, decoder_hidden = self.encoder(packed_sequence)
        
        #### Decoder
        decoder_input = torch.zeros(1, batch_size, 1).to(device).new_full((1, batch_size, 1), -1)
        max_length = lengths_tensor[0].item()
        prediction_sequence = torch.zeros(max_length + 1, batch_size, 4).to(device)
        decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
        decoder_output = self.decoder_projection(decoder_output[0])
        
        prediction_sequence[0] = decoder_output
        _, decoder_input = decoder_output.topk(1)
        
        if target is not None and random.random() < 0.5:
            target = target.transpose(0, 1)
            for idx in range(max_length):
                decoder_input = target[idx].view(1, batch_size, 1).float()
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                decoder_output = self.decoder_projection(decoder_output[0])
                prediction_sequence[idx + 1] = decoder_output
        else:
            for idx in range(max_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input.view(1, batch_size, 1).float(), decoder_hidden)
                decoder_output = self.decoder_projection(decoder_output[0])
                prediction_sequence[idx + 1] = decoder_output
                _, decoder_input = decoder_output.topk(1)
                
        return prediction_sequence

In [20]:
addresses_to_parse = ["3 jersey road, un 28 nsw 2064", "fast ave", "hello ave"]

In [21]:
emb_model = BPEmb(lang="multi", vs=100000, dim=300)
vectorizer = BPEmbVectorizer(embeddings_model=emb_model)
model = Seq2SeqModel()

In [22]:
output = vectorizer(addresses_to_parse)
padded_output = bpemb_data_padding(output)

In [23]:
predictions = model(*padded_output)

In [24]:
tags_predictions = predictions.max(2)[1].transpose(0, 1).cpu().numpy()
tags_predictions_prob = torch.exp(predictions.max(2)[0]).transpose(0, 1).detach().cpu().numpy()

In [25]:
predictions.shape

torch.Size([8, 3, 4])

In [26]:
tags_predictions_prob

array([[0.2537793 , 0.25413087, 0.25421765, 0.2542254 , 0.2542219 ,
        0.2542237 , 0.25423124, 0.2542414 ],
       [0.2537815 , 0.25405163, 0.2541368 , 0.25416574, 0.2541837 ,
        0.2542014 , 0.25421923, 0.2542355 ],
       [0.25378394, 0.2540525 , 0.25413883, 0.2541687 , 0.25418675,
        0.25420406, 0.25422132, 0.25423703]], dtype=float32)

In [27]:
tagged_addresses_components = []
for address_to_parse, tags_prediction, tags_prediction_prob in zip(addresses_to_parse, tags_predictions,
                                                                   tags_predictions_prob):
    tagged_address_components = []
    print(address_to_parse)
    for word, predicted_idx_tag, tag_proba in zip(address_to_parse.split(), tags_prediction,
                                                  tags_prediction_prob):
        print("\t", word, predicted_idx_tag, tag_proba)
        tag = (tags_converter(predicted_idx_tag), tag_proba)
        tagged_address_components.append((word, tag))
    tagged_addresses_components.append(tagged_address_components)

3 jersey road, un 28 nsw 2064
	 3 1 0.2537793
	 jersey 1 0.25413087
	 road, 1 0.25421765
	 un 1 0.2542254
	 28 1 0.2542219
	 nsw 1 0.2542237
	 2064 1 0.25423124
fast ave
	 fast 1 0.2537815
	 ave 1 0.25405163
hello ave
	 hello 1 0.25378394
	 ave 1 0.2540525


In [28]:
tagged_addresses_components

[[('3', ('Street', 0.2537793)),
  ('jersey', ('Street', 0.25413087)),
  ('road,', ('Street', 0.25421765)),
  ('un', ('Street', 0.2542254)),
  ('28', ('Street', 0.2542219)),
  ('nsw', ('Street', 0.2542237)),
  ('2064', ('Street', 0.25423124))],
 [('fast', ('Street', 0.2537815)), ('ave', ('Street', 0.25405163))],
 [('hello', ('Street', 0.25378394)), ('ave', ('Street', 0.2540525))]]

### Dataset

In [50]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, csv_file=TRAIN_CSV_PATH):
        self.df = pd.read_csv(csv_file)
        self.df = self.df[self.df['labels'].notnull()]
        self.df['labels'] = self.df['labels'].apply(literal_eval)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        raw_address = self.df.iloc[idx]['sanitized_raw_address']
        POI, street = self.df.iloc[idx]['POI/street'].split('/')
        labels = self.df.iloc[idx]['labels']
        return raw_address, labels

In [51]:
dataset = Dataset()

Exception ignored in: <function tqdm.__del__ at 0x7fa070b691f0>
Traceback (most recent call last):
  File "/fast/Workspace/shopee-street/shopee-street/.env/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/fast/Workspace/shopee-street/shopee-street/.env/lib/python3.8/site-packages/tqdm/notebook.py", line 271, in close
    super(tqdm_notebook, self).close()
  File "/fast/Workspace/shopee-street/shopee-street/.env/lib/python3.8/site-packages/tqdm/std.py", line 1264, in close
    if self.disable:
AttributeError: 'tqdm_notebook' object has no attribute 'disable'
Exception ignored in: <function tqdm.__del__ at 0x7fa070b691f0>
Traceback (most recent call last):
  File "/fast/Workspace/shopee-street/shopee-street/.env/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/fast/Workspace/shopee-street/shopee-street/.env/lib/python3.8/site-packages/tqdm/notebook.py", line 271, in close
    super(tqdm_notebook, self).clos

In [52]:
dataset.df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,raw_address,POI/street,sanitized_raw_address,is_clean,labels
0,0,0,4941,4941,"hadi raya kre, no 29 balaraja",hadi/raya kre,"hadi raya kre , no 29 balaraja",True,"[PointOfInterest, Street, Street, Other, Other..."
2,2,2,115253,115253,"merak 11, no 10 cikarang utara",/merak 11,"merak 11 , no 10 cikarang utara",True,"[Street, Street, Other, Other, Other, Other, O..."
3,3,3,299321,299321,hotel fairmont jl asia afrika 8 gelora bung karno,bung karno/jl asia afrika,hotel fairmont jl asia afrika 8 gelora bung karno,True,"[Other, Other, Street, Street, Street, Other, ..."
4,4,4,173570,173570,"kar anyar d, 4 karang anyar rt 15 1 sawah besar",/kar anyar d,"kar anyar d , 4 karang anyar rt 15 1 sawah besar",True,"[Street, Street, Street, Other, Other, Other, ..."
5,5,5,30862,30862,roban gg. bakti 135 singkawang tengah,/gg. bakti,roban gg. bakti 135 singkawang tengah,True,"[Other, Street, Street, Other, Other, Other]"
...,...,...,...,...,...,...,...,...,...
239995,239995,239995,5274,5274,"cata xii,",/cata xii,"cata xii ,",True,"[Street, Street, Other]"
239996,239996,239996,161682,161682,nga jaya 27 1 pucang sewu gubeng,/nga jaya,nga jaya 27 1 pucang sewu gubeng,True,"[Street, Street, Other, Other, Other, Other, O..."
239997,239997,239997,28853,28853,taman ubud lest v no 21 binong curug,/taman ubud lest v,taman ubud lest v no 21 binong curug,True,"[Street, Street, Street, Street, Other, Other,..."
239998,239998,239998,298534,298534,raya riga tanjung ganti i kelam tengah,/raya riga,raya riga tanjung ganti i kelam tengah,True,"[Street, Street, Other, Other, Other, Other, O..."


## Test

In [53]:
test_addresses = [
    ["50 Hello ave", ["Other", "Other", "Other"]], 
    ["SS road", ["Other", "Other"]],
]

In [54]:
vectorizer = BPEmbVectorizer(embeddings_model=emb_model)
tags_vectorizer = TagsConverter(tags)

In [55]:
import sys
import numpy
import pprint
numpy.set_printoptions(threshold=100)

In [56]:
input_sequence = []
target_sequence = []

input_sequence.extend(vectorizer([address[0] for address in test_addresses]))
for address in test_addresses:
    target_tmp = [tags_vectorizer(target) for target in address[1]]
    target_tmp.append(tags_vectorizer("EOS"))
    target_sequence.append(target_tmp)

In [57]:
input_sequence

[([[array([-0.032396, -0.051103,  0.449281, ...,  0.354258, -0.630191,
            0.640046], dtype=float32),
    array([0., 0., 0., ..., 0., 0., 0.])],
   [array([ 0.119792, -0.087593, -0.366256, ..., -0.126352,  0.036049,
            0.363969], dtype=float32),
    array([ 0.302875, -0.092815, -0.317463, ...,  0.52216 , -0.115126,
            0.237173], dtype=float32)],
   [array([ 0.271623, -0.318372,  0.468792, ...,  0.548052,  0.273256,
           -0.513469], dtype=float32),
    array([0., 0., 0., ..., 0., 0., 0.])]],
  [1, 2, 1]),
 ([[array([-0.126823,  0.425704,  0.154299, ...,  0.366716,  0.07786 ,
            0.562716], dtype=float32),
    array([0., 0., 0., ..., 0., 0., 0.])],
   [array([-0.147449,  0.12949 , -0.107977, ...,  0.101064,  0.264715,
            0.098076], dtype=float32),
    array([0., 0., 0., ..., 0., 0., 0.])]],
  [1, 1])]

In [58]:
target_sequence

[[2, 2, 2, 3], [2, 2, 3]]

## Data Loader

In [67]:
def _convert_bpemb_sequence_to_tensor(batch):
    """
    Sort and convert a BPEmb sequence into a tensor with target element
    """
    sorted_batch = sorted(batch, key=lambda x: len(x[0][1]), reverse=True)
    return zip(*[(torch.tensor(vectors), word_decomposition_len, torch.tensor(target_vectors), len(vectors))
                 for (vectors, word_decomposition_len), target_vectors in sorted_batch])

def bpemb_data_padding_with_target(batch: List[Tuple], padding_value=-100) -> Tuple:
    """
    Function that add padding to the sequences and to the decomposition lengths so all can have the same length as
    the longest one.
    Args:
        batch (list[tuple]): The list of vectorize tupled batch data where the first element is the address embeddings
            and the second is the word decomposition lengths.
    Returns:
        A tuple ((``x``, ``y`` , ``z``), ``w``). The element ``x`` is a tensor of padded word vectors,
        ``y`` is the padded decomposition lengths, ``z`` is the original lengths of the sequences before padding, and
        ``w`` is a tensor of padded target idx.
    """

    sequences_vectors, decomp_len, target_vectors, lengths = _convert_bpemb_sequence_to_tensor(batch)

    lengths = torch.tensor(lengths)

    padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=padding_value)
    padded_target_vectors = pad_sequence(target_vectors, batch_first=True, padding_value=padding_value)

    # pad decomposition length
    max_sequence_length = lengths.max().item()
    for decomposition_length in decomp_len:
        if len(decomposition_length) < max_sequence_length:
            decomposition_length.extend([1] * (max_sequence_length - len(decomposition_length)))

    return (padded_sequences_vectors, list(decomp_len), lengths), padded_target_vectors


In [68]:
from torch.utils.data import DataLoader

In [69]:
def collate_fn_train(batch_pairs):
    input_sequence = []
    target_sequence = []

    input_sequence.extend(vectorizer([address[0] for address in batch_pairs]))
    for address in batch_pairs:
        target_tmp = [tags_vectorizer(target) for target in address[1]]
        target_tmp.append(tags_vectorizer("EOS"))
        target_sequence.append(target_tmp)
        
    raw = [address[0] for address in batch_pairs]
    return raw, zip(input_sequence, target_sequence)

In [82]:
data_loader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn_train)

In [83]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [125]:
model = Seq2SeqModel().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
loss_fn = nn.NLLLoss().to(device)

In [126]:
writer = SummaryWriter()
counter = 0

In [None]:
for epoch in range(10):
    for idx, batch in enumerate(tqdm(data_loader)):
        raw, batch = batch
        padded_input, padded_target = bpemb_data_padding_with_target(batch)
        padded_input = (padded_input[0].to(device), 
                        padded_input[1], 
                        padded_input[2].to(device))
        padded_target = padded_target.to(device)
        model.zero_grad()

        predictions = model(*padded_input, padded_target)
        predictions = predictions.permute(1, 2, 0)
        loss = loss_fn(predictions, padded_target)

        loss.backward()
        optimizer.step()
        counter += 1

        writer.add_scalar("Loss/train", loss, counter)

        if counter % 100 == 0:
            TEST_IDX = 0
            sample_raw = raw[TEST_IDX].split()
            sample_pred = predictions[TEST_IDX].transpose(0,1).argmax(dim=1).detach().cpu().numpy()
            sample_pred_string = [tags_converter(x) for x in sample_pred]
            sample_target = padded_target[TEST_IDX].detach().cpu().numpy()
            sample_target_string = [tags_converter(x) for x in sample_target]
            compact = list(zip(sample_raw, sample_pred_string, sample_target_string))
            print(raw[TEST_IDX], compact)
            
    lr_scheduler.step()

  0%|          | 0/5903 [00:00<?, ?it/s]

pasar enjo lantai 2 no a.l01.bks.123 [('pasar', 'Other', 'Other'), ('enjo', 'Other', 'Street'), ('lantai', 'Other', 'Street'), ('2', 'Other', 'Street'), ('no', 'Other', 'Street'), ('a.l01.bks.123', 'Other', 'Other')]
mig ii 1 40534 cimahi selatan [('mig', 'Other', 'Other'), ('ii', 'Other', 'Other'), ('1', 'Other', 'Other'), ('40534', 'Other', 'Other'), ('cimahi', 'Other', 'Other'), ('selatan', 'Other', 'PointOfInterest')]
kedai mie kdl , kapas kram [('kedai', 'Other', 'PointOfInterest'), ('mie', 'Other', 'PointOfInterest'), ('kdl', 'Other', 'Other'), (',', 'Other', 'Other'), ('kapas', 'Other', 'Other'), ('kram', 'Other', 'Other')]
kademangan komp batan indah setu [('kademangan', 'Other', 'Other'), ('komp', 'Other', 'Other'), ('batan', 'Other', 'Street'), ('indah', 'Other', 'Street'), ('setu', 'Other', 'Street')]
lowokwaru kaliu 22 a lowokwaru [('lowokwaru', 'Other', 'Other'), ('kaliu', 'Other', 'Other'), ('22', 'Other', 'Other'), ('a', 'Other', 'PointOfInterest'), ('lowokwaru', 'Other'

## Test

In [91]:
test_data = pd.read_csv("./scl-2021-ds/valid_split.csv")

In [101]:
model = model.to('cpu')

In [115]:
n_correct = 0
n_correct_only_POI = 0
n_correct_only_street = 0
n_all = 0

In [123]:
pbar = tqdm(test_data.iterrows(), total=len(test_data))

  0%|          | 0/60000 [00:00<?, ?it/s]

In [124]:
for row in pbar:
    raw_address = row[1]['raw_address']
    target = row[1]['POI/street']
    emb = vectorizer([raw_address])
    padded = bpemb_data_padding(emb)
    pred = model(*padded)
    tags_predictions = pred.max(2)[1].transpose(0, 1).cpu().numpy()[0]
    
    tokens = raw_address.split()
    pred_POI = []
    pred_Street = []
    for (token, pred) in zip(tokens, tags_predictions):
        if pred == 0: 
            pred_POI.append(token)
        elif pred == 1:
            pred_Street.append(token)
            
    pred_POI = ' '.join(pred_POI)
    pred_Street = ' '.join(pred_Street)
    output = f'{pred_POI}/{pred_Street}'

    n_correct += (output == target)
    n_correct_only_POI += (pred_POI == target.split('/')[0])
    n_correct_only_street += (pred_Street == target.split('/')[1])
    n_all += 1
    
    acc = (n_correct / n_all)
    acc_POI = (n_correct_only_POI / n_all)
    acc_street = (n_correct_only_street / n_all)
    pbar.set_description(f"acc={acc:.3f} acc_POI={acc_POI:.3f} acc_street={acc_street:.3f}")

KeyboardInterrupt: 