### SNIPS NLU

#### credit -> https://colab.research.google.com/drive/1wgWdxUpKf3FWJgqA6ogBGDEzxAosjJMI

Snips NLU consists of 2 tasks (Slot Filling and Classification)

Slot filling can be formulated as NER

### Download Data

In [None]:
from urllib.request import urlretrieve
from pathlib import Path


SNIPS_DATA_BASE_URL = (
    "https://github.com/ogrisel/slot_filling_and_intent_detection_of_SLU/blob/"
    "master/data/snips/"
)
for filename in ["train", "valid", "test", "vocab.intent", "vocab.slot"]:
    path = Path(filename)
    if not path.exists():
        print(f"Downloading {filename}...")
        urlretrieve(SNIPS_DATA_BASE_URL + filename + "?raw=true", path)

In [1]:
# Read SNIPS data

import pandas as pd
import numpy as np
from pathlib import Path


def parse_line(line):
    utterance_data, intent_label = line.split(" <=> ")
    items = utterance_data.split()
    words = [item.rsplit(":", 1)[0] for item in items]
    word_labels = [item.rsplit(":", 1)[1] for item in items]
    return {
        "intent_label": intent_label,
        "words": " ".join(words),
        "word_labels": " ".join(word_labels),
        "length": len(words),
    }


lines_train = Path("train").read_text().strip().splitlines()
lines_valid = Path("valid").read_text().strip().splitlines()
lines_test = Path("test").read_text().strip().splitlines()

df_train = pd.DataFrame([parse_line(line) for line in lines_train])
df_valid = pd.DataFrame([parse_line(line) for line in lines_valid])
df_test = pd.DataFrame([parse_line(line) for line in lines_test])

# Slot labels
slot_names = ["[PAD]", "[EXTRA]"]
slot_names += Path("vocab.slot").read_text().strip().splitlines()
slot_map = {}
for label in slot_names:
    slot_map[label] = len(slot_map)

In [2]:
df_train

Unnamed: 0,intent_label,words,word_labels,length
0,AddToPlaylist,Add Don and Sherri to my Meditate to Sounds of...,O B-entity_name I-entity_name I-entity_name O ...,12
1,AddToPlaylist,put United Abominations onto my rare groove pl...,O B-entity_name I-entity_name O B-playlist_own...,8
2,AddToPlaylist,add the tune by misato watanabe to the Trapeo ...,O O B-music_item O B-artist I-artist O O B-pla...,10
3,AddToPlaylist,add this artist to my this is miguel bosé play...,O O B-music_item O B-playlist_owner B-playlist...,10
4,AddToPlaylist,add heresy and the hotel choir to the evening ...,O B-entity_name I-entity_name I-entity_name I-...,11
...,...,...,...,...
13079,SearchScreeningEvent,find a Consolidated Theatres showing The Good ...,O O B-location_name I-location_name O B-movie_...,10
13080,SearchScreeningEvent,where can i see animated movies in the neighbo...,O O O O B-movie_type I-movie_type B-spatial_re...,9
13081,SearchScreeningEvent,Showtimes for animated movies in the area .,O O B-movie_type I-movie_type B-spatial_relati...,8
13082,SearchScreeningEvent,Which animated movies are playing at Megaplex ...,O B-movie_type I-movie_type O O O B-location_n...,11


In [3]:
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
SPECIAL_PIECE = "▁"

In [25]:


def get_tokens_labels(
    aligned_words, orig_to_new_index, label_tokens, sub_words_mapped, label_pad_token="[PAD]"
):
    aligned_labels = [label_pad_token] * len(aligned_words)
    for original_pos, new_pos in enumerate(orig_to_new_index):
        aligned_labels[new_pos] = label_tokens[original_pos]
    flat_tokens = []
    flat_labels = []

    # The first word of the subword token is assigned entity
    # other tokens will be add PAD labels (we will mask it while training)
    assert (len(aligned_words) == len(sub_words_mapped) == len(aligned_labels))
    for (_align_word, _align_word, _align_label) in zip(
        aligned_words, sub_words_mapped, aligned_labels
    ):
        temp_w = []
        for _align_word in _align_word:
            temp_w.append(_align_word)
        temp_l = [label_pad_token] * len(temp_w)
        temp_l[0] = _align_label
        flat_tokens.extend(temp_w)
        flat_labels.extend(temp_l)
        
    return flat_tokens, flat_labels

In [29]:
from tf_transformers.utils import fast_sp_alignment
def tokenize_and_align_sentence_label(
    sentence, word_tokens, label_tokens, label_pad_token
):
    subwords = tokenizer.tokenize(sentence)
    orig_to_new_index, aligned_words, sub_words_mapped = fast_sp_alignment(
            sentence, tokenizer, SPECIAL_PIECE
        )
    
    flat_tokens, flat_labels = get_tokens_labels(aligned_words, orig_to_new_index, label_tokens, sub_words_mapped, label_pad_token
    )
    return flat_tokens, flat_labels

In [98]:
encoder_max_length = 50  # 50 is enough for SNIPS


def process_data_to_model_inputs(sentence, flat_labels, label_pad_token):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    result = {}
    result["input_ids"] = tokenizer.encode(
        sentence, truncation=True, max_length=encoder_max_length
    )
    result["input_mask"] = [1] * len(result["input_ids"])
    result["input_type_ids"] = [0] * len(result["input_ids"])
    labels = [slot_map[token] for token in flat_labels]
    labels = [slot_map[label_pad_token]] + labels + [slot_map[label_pad_token]]  # for [CLS] and [SEP]
    label_mask = []
    for token in flat_labels:
        if token == [label_pad_token]:
            label_mask.append(0)
            continue
        label_mask.append(1)
    label_mask = [0] + label_mask + [0]  # for [CLS] and [SEP]
    result["labels"] = labels
    result["label_mask"] = label_mask
    return result


ignored_index = []
def make_parse_fn(df):
    for index, row in df.iterrows():
        sentence = row["words"]
        labels = row["word_labels"]
        word_tokens = sentence.split()
        label_tokens = labels.split()
        if len(word_tokens) != len(label_tokens):
            ignored_index.append(index)
            continue
        flat_tokens, flat_labels = tokenize_and_align_sentence_label(
            sentence, word_tokens, label_tokens, label_pad_token="[PAD]"
        )
        yield process_data_to_model_inputs(sentence, flat_labels, label_pad_token)
    print("Ignored {} indexes".format(len(ignored_index)))


parse_fn = make_parse_fn(df_train)

In [38]:
# use TFProcessor only if your data is in range of 10k - 20k maximum
# otherwise use TFWriter
from tf_transformers.data import TFProcessor

tf_processor = TFProcessor()
train_dataset = tf_processor.process(parse_fn)

INFO:absl:Processed  10000 examples so far
INFO:absl:Total individual observations/examples written is 13073


Ignored 11 indexes


In [40]:
train_dataset.element_spec

{'input_ids': RaggedTensorSpec(TensorShape([None]), tf.int32, 0, tf.int64),
 'input_mask': RaggedTensorSpec(TensorShape([None]), tf.int32, 0, tf.int64),
 'input_type_ids': RaggedTensorSpec(TensorShape([None]), tf.int32, 0, tf.int64),
 'labels': RaggedTensorSpec(TensorShape([None]), tf.int32, 0, tf.int64),
 'label_mask': RaggedTensorSpec(TensorShape([None]), tf.int32, 0, tf.int64)}

In [45]:
import tensorflow as tf
from tf_transformers.data import separate_x_y
def pad_ragged(dataset):
    """
    Pad dataset of dict .

    """
    dataset_padded = {}
    for item, tensor in dataset.items():
        if isinstance(tensor, tf.RaggedTensor):
            dataset_padded[item] = tensor.to_tensor()
        else:
            dataset_padded[item] = tensor
    return dataset_padded

def auto_batch_for_training(tf_dataset, 
                            batch_size, 
                            x_keys = None,
                            y_keys = None, 
                            shuffle=False, 
                            drop_remainder=False, 
                            shuffle_buffer_size=10000, 
                            prefetch_buffer_size=100):
    element_spec = tf_dataset.element_spec
    dataset = tf_dataset.batch(batch_size, drop_remainder=drop_remainder)
    dataset = dataset.map(pad_ragged, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if x_keys and y_keys:
        dataset = dataset.map(lambda x: separate_x_y(x, x_keys, y_keys),
                                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if shuffle:
        dataset = dataset.shuffle(
            shuffle_buffer_size, seed=None, reshuffle_each_iteration=True
        )
    dataset = dataset.prefetch(prefetch_buffer_size)
    return dataset

x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['labels', 'label_mask']
train_dataset = auto_batch_for_training(train_dataset, batch_size=16, x_keys=x_keys, y_keys=y_keys)

In [46]:
for item in train_dataset:
    print(item)
    break

({'input_ids': <tf.Tensor: shape=(16, 17), dtype=int32, numpy=
array([[    2,  3547,   221,    17, 29533,    20,    51,  9488, 22443,
           20,  2795,    16,  1444, 27063,     3,     0,     0],
       [    2,   442,   181,    21,  1192, 15971,    18,  1204,    51,
         2890, 12489, 27063,     3,     0,     0,     0,     0],
       [    2,  3547,    14,  6768,    34,  2462,  6043, 29592,    20,
           14,  5585,  3894, 27063,     3,     0,     0,     0],
       [    2,  3547,    48,  1169,    20,    51,    48,    25,  8025,
        22629, 27063,     3,     0,     0,     0,     0,     0],
       [    2,  3547,   235,  4980,    17,    14,  1454,  4962,    20,
           14,  2089,  5402, 27063,     3,     0,     0,     0],
       [    2,  2247,  3547,   487,  8883,  3532,    20,    51, 27063,
           48,    25, 14241,     3,     0,     0,     0,     0],
       [    2,  3547,    40,   244,    20,    51,   968,   333,    55,
        10070,   232,    58,  1329,   814,     3, 

In [47]:
#### Load Albert Model

from tf_transformers.models import AlbertModel
model_layer, model, config = AlbertModel(model_name='albert_base_v2', 
                   is_training=True
                   )

INFO:absl:Initialized Variables


In [73]:
from tf_transformers.core import LegacyModel, LegacyLayer


class Token_Classification(LegacyLayer):
    def __init__(self, model, token_vocab_size, use_all_layers=False, activation="tanh", **kwargs):
        super(Token_Classification, self).__init__(**kwargs)
        self.model = model
        if isinstance(model, LegacyModel):
            self.model_config = model.model_config
        elif isinstance(model, tf.keras.layers.Layer):
            self.model_config = model._config_dict
        self.use_all_layers = use_all_layers
        self.logits_layer = tf.keras.layers.Dense(
            token_vocab_size,
            activation=activation,
            use_bias=True,
            kernel_initializer="glorot_uniform",
            bias_initializer="zeros",
        )

    def call(self, inputs):
        result = self.model(inputs)
        token_logits = []
        if self.use_all_layers:
            # each layer token embeddings
            for token_embeddings in result["all_layer_token_embeddings"]:
                outputs = self.logits_layer(token_embeddings)
                token_logits.append(outputs)
            return {'token_logits': token_logits}            

        else:
            # last layer token embeddings
            token_embeddings = result["token_embeddings"]
            outputs = self.logits_layer(token_embeddings)
            return {
                    "token_logits": outputs
            }
        
    def get_model(self):
        layer_output = self(self.model.input)
        model = LegacyModel(inputs=self.model.input, outputs=layer_output, name='span_selection')
        model.model_config = self.model_config
        return model
        

In [74]:
tf.keras.backend.clear_session()
token_classification_layer = Token_Classification(model=model,
                                      token_vocab_size=len(slot_map),
                                      use_all_layers=True, 
                                      is_training=True)
token_classification_model = token_classification_layer.get_model()

In [75]:
result = token_classification_model(item[0])

In [76]:
result

{'token_logits': [<tf.Tensor: shape=(16, 17, 74), dtype=float32, numpy=
  array([[[-0.6589864 , -0.7563687 ,  0.98634696, ...,  0.650301  ,
            0.6654792 , -0.90308124],
          [ 0.19518195, -0.98253757, -0.6353485 , ...,  0.94649124,
           -0.19243817,  0.84831876],
          [ 0.49345627, -0.5446298 ,  0.758992  , ...,  0.31135795,
            0.1970949 , -0.8132535 ],
          ...,
          [ 0.6537937 , -0.29818448,  0.9170768 , ...,  0.05026689,
           -0.8158943 ,  0.3864477 ],
          [ 0.9923766 , -0.2081895 , -0.47042075, ...,  0.77227235,
            0.9294651 ,  0.51044583],
          [ 0.905505  , -0.95786756,  0.91853935, ...,  0.52693677,
           -0.2729776 ,  0.20590636]],
  
         [[-0.70921683, -0.78513074,  0.9884197 , ...,  0.70700794,
            0.671792  , -0.9010645 ],
          [ 0.360376  , -0.9713132 ,  0.586849  , ...,  0.9923516 ,
            0.80661714,  0.32555136],
          [ 0.78514266, -0.9592638 ,  0.9210562 , ..., -0.881

In [65]:
tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
    item[1]["labels"], result["token_logits"]
) * tf.cast(item[1]["label_mask"], tf.float32)) / tf.reduce_sum(tf.cast(item[1]["label_mask"], tf.float32))

<tf.Tensor: shape=(), dtype=float32, numpy=4.501221>

In [66]:
from tf_transformers.losses import cross_entropy_loss_fast

In [84]:
def token_loss(y_true_dict, token_logits):
    loss = cross_entropy_loss_fast(
        labels=y_true_dict["labels"],
        logits=token_logits,
        label_weights=y_true_dict["label_mask"],
    )
    return loss

def token_loss_all_layers(y_true_dict, y_pred_dict):
    layer_loss = []
    for token_logits in y_pred_dict['token_logits']:
        loss = token_loss(y_true_dict, token_logits)
        layer_loss.append(loss)
    return tf.reduce_mean(layer_loss)

In [85]:
token_loss_all_layers(item[1], result)

<tf.Tensor: shape=(), dtype=float32, numpy=4.513224>

In [93]:
orig_to_new_index, aligned_words, sub_words_mapped = fast_sp_alignment(df_train['words'][0], 
                                                                        tokenizer, 
                                                                        SPECIAL_PIECE)

In [94]:
aligned_words

['Add',
 'Don',
 'and',
 'Sherri',
 'to',
 'my',
 'Meditate',
 'to',
 'Sounds',
 'of',
 'Nature',
 'playlist']

In [99]:
row = df_train.loc[0]

In [101]:
        sentence = row["words"]
        labels = row["word_labels"]
        word_tokens = sentence.split()
        label_tokens = labels.split()
#         if len(word_tokens) != len(label_tokens):
#             ignored_index.append(index)
#             continue
        flat_tokens, flat_labels = tokenize_and_align_sentence_label(
            sentence, word_tokens, label_tokens, label_pad_token="[PAD]"
        )

In [102]:
flat_tokens

['▁add',
 '▁don',
 '▁and',
 '▁sherri',
 '▁to',
 '▁my',
 '▁med',
 'itate',
 '▁to',
 '▁sounds',
 '▁of',
 '▁nature',
 '▁playlist']

In [103]:
flat_labels

['O',
 'B-entity_name',
 'I-entity_name',
 'I-entity_name',
 'O',
 'B-playlist_owner',
 'B-playlist',
 '[PAD]',
 'I-playlist',
 'I-playlist',
 'I-playlist',
 'I-playlist',
 'O']

In [106]:
sample_inputs = {}
sample_inputs['input_ids'] = tf.constant([tokenizer.convert_tokens_to_ids(flat_tokens)])
sample_inputs['input_mask'] = tf.ones_like(sample_inputs['input_ids'])
sample_inputs['input_type_ids'] = tf.zeros_like(sample_inputs['input_ids'])

In [109]:
result = token_classification_model(sample_inputs)
token_logits = result['token_logits'][-1]

In [114]:
tf.argmax(token_logits, axis=-1)

<tf.Tensor: shape=(1, 13), dtype=int64, numpy=array([[41, 40, 41, 41, 41, 41, 45,  6, 41, 41, 41, 41, 41]])>

In [118]:
start_index = 0
for item in sub_words_mapped:
    end_index = start_index + len(item)
    print(item, len(item), (start_index, end_index))
    start_index = end_index

['▁add'] 1 (0, 1)
['▁don'] 1 (1, 2)
['▁and'] 1 (2, 3)
['▁sherri'] 1 (3, 4)
['▁to'] 1 (4, 5)
['▁my'] 1 (5, 6)
['▁med', 'itate'] 2 (6, 8)
['▁to'] 1 (8, 9)
['▁sounds'] 1 (9, 10)
['▁of'] 1 (10, 11)
['▁nature'] 1 (11, 12)
['▁playlist'] 1 (12, 13)


In [None]:
def pack_like_this(input_list, output_list):
    
    start_index = 0
    for current_index, item in input_list:
        end_index = start_index + len(item)