## Pre-Process datasets

In [14]:
import os

# The train and test files sit side by side in the sibling "dataset" directory.
train_path, test_path = (
    os.path.join("..", "dataset", file_name)
    for file_name in ("laptop14_train.txt", "laptop14_test.txt")
)

# Each input line has the form "<sentence>####<word>=<tag> <word>=<tag> ...".
# read_data() turns the file into a list of records shaped like:
# [{'utterance': '<word> <word> ...',
#   'slot': '<tag> <tag> ...'},...]

def read_data(path):
    """Parse an annotated corpus file into utterance/slot records.

    Every non-blank line is expected to look like
    ``<sentence>####<word>=<tag> <word>=<tag> ...``. Only the annotated part
    after ``####`` is used; the raw sentence in front of it is ignored.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A list of ``{'utterance': str, 'slot': str}`` dicts, one per valid
        line. Both strings are space-separated and contain the same number of
        items (one tag per word).

    Side effects:
        Prints every malformed line (which is skipped) and finally
        ``Error:`` followed by the number of malformed lines.
    """
    raw_data = []
    error = 0

    with open(path, encoding="utf-8", mode="r") as f:
        for raw_line in f:
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue

            parts = line.split("####")
            if len(parts) != 2:
                # Missing (or repeated) sentence/annotation separator.
                error += 1
                print(line)
                continue

            all_body, all_tags = [], []
            malformed = False
            # Drop empty items so repeated spaces do not produce bogus tokens.
            for token in filter(None, parts[1].split(" ")):
                # Split on the LAST "=" so that words which themselves contain
                # "=" (e.g. the token "==O" for the word "=") keep their text.
                word, sep, tag = token.rpartition("=")
                if not (sep and word and tag):
                    malformed = True
                    break
                all_body.append(word)
                all_tags.append(tag)

            if malformed:
                error += 1
                print(line)
                continue

            raw_data.append({'utterance': " ".join(all_body), 'slot': " ".join(all_tags)})

    print("Error: ", error)
    return raw_data
    
# Load the train split first, then the test split; read_data prints its
# "Error:" summary once per file.
train_raw, test_raw = map(read_data, (train_path, test_path))

Error:  0
Error:  0


In [15]:
from collections import Counter
from sklearn.model_selection import train_test_split

def split_sets(tmp_train_raw):
    """Split the training records into a train part and a 15% dev part.

    The split is stratified on each record's full ``"slot"`` string. Records
    whose slot string occurs only once cannot be stratified, so they are kept
    out of the split and appended to the training part afterwards.

    Args:
        tmp_train_raw: List of record dicts, each holding a ``"slot"`` entry.

    Returns:
        A ``(train_raw, dev_raw)`` tuple of record lists.
    """
    dev_fraction = 0.15

    slot_strings = [sample["slot"] for sample in tmp_train_raw]
    occurrences = Counter(slot_strings)

    stratifiable, stratify_keys, singletons = [], [], []
    for sample, key in zip(tmp_train_raw, slot_strings):
        if occurrences[key] <= 1:
            # Unique slot strings go straight to the training part.
            singletons.append(sample)
            continue
        stratifiable.append(sample)
        stratify_keys.append(key)

    # Seeded, shuffled, stratified split; the label halves are not needed.
    train_raw, dev_raw, _, _ = train_test_split(
        stratifiable,
        stratify_keys,
        test_size=dev_fraction,
        random_state=42,
        shuffle=True,
        stratify=stratify_keys,
    )
    train_raw.extend(singletons)

    return train_raw, dev_raw

# Carve the dev split out of the training data; `train_raw` is rebound to the
# reduced training portion returned by split_sets.
train_raw, dev_raw = split_sets(train_raw)

In [16]:
# Collect every slot label that occurs in any of the three splits.
# split() (no argument) matches how the labels are split again during
# tokenization and never yields an empty-string label.
slots_set = set()
for phrases in [train_raw, dev_raw, test_raw]:
    for phrase in phrases:
        slots_set.update(phrase['slot'].split())

# Id 0 is reserved for the padding label.
slots2id = {"pad": 0}
id2slots = {0: "pad"}
# Iterate in sorted order: the iteration order of a set of strings can change
# between interpreter runs, which would silently change the label <-> id
# mapping from one session to the next.
for slot in sorted(slots_set):
    slots2id[slot] = len(slots2id)
    id2slots[len(id2slots)] = slot


In [20]:
import logging
from transformers import BertTokenizer, get_linear_schedule_with_warmup


def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Split a sentence into sub-word tokens and repeat each word's label.

    Args:
        sentence: Whitespace-separated words.
        text_labels: Whitespace-separated labels, one per word of ``sentence``.
        tokenizer: Object exposing ``tokenize(word) -> list of sub-tokens``.

    Returns:
        A ``(tokens, labels)`` tuple of equally long lists. Every sub-token of
        a word carries that word's label; a word that yields no sub-token
        contributes nothing to either list.
    """
    words = sentence.split()
    word_labels = text_labels.split()

    if len(words) != len(word_labels):
        # zip() below stops at the shorter sequence, so without this message
        # the surplus words or labels would be dropped without any trace.
        logging.warning(
            "Word/label count mismatch (%d words, %d labels) in: %r",
            len(words),
            len(word_labels),
            sentence,
        )

    tokenized_sentence = []
    labels = []
    for word, label in zip(words, word_labels):
        tokenized_word = tokenizer.tokenize(word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * len(tokenized_word))

    return tokenized_sentence, labels

def tokenize_data(raw_data, tokenizer):
    """Tokenize every record and attach sub-token level labels.

    Args:
        raw_data: Iterable of dicts with ``"utterance"`` and ``"slot"`` entries.
        tokenizer: Tokenizer forwarded to ``tokenize_and_preserve_labels``.

    Returns:
        A list of dicts with the keys ``raw_slots``, ``raw_utterance``,
        ``tokenized_utterance`` and ``tokenized_slots``.
    """
    processed_data = []
    for sample in raw_data:
        slot_text = sample["slot"]
        utterance_text = sample["utterance"]

        sub_tokens, sub_labels = tokenize_and_preserve_labels(
            utterance_text, slot_text, tokenizer
        )

        processed_data.append(
            {
                "raw_slots": slot_text,
                "raw_utterance": utterance_text,
                "tokenized_utterance": sub_tokens,
                "tokenized_slots": sub_labels,
            }
        )
    return processed_data

def encode_data(tokenized_data, tokenizer, slots2id):
    """Turn tokenized records into numerical model inputs.

    Args:
        tokenized_data: Iterable of dicts holding ``raw_slots``,
            ``raw_utterance``, ``tokenized_utterance`` and ``tokenized_slots``.
        tokenizer: Object exposing ``encode_plus``; it receives the sub-token
            list with ``add_special_tokens=False``.
        slots2id: Mapping from slot label to integer id.

    Returns:
        A list of dicts that carry the four input entries over unchanged and
        add ``encoded_utterance`` (the ``encode_plus`` result) and
        ``encoded_slots`` (the label ids).
    """

    def _encode_one(sample):
        return {
            "raw_slots": sample["raw_slots"],
            "raw_utterance": sample["raw_utterance"],
            "tokenized_utterance": sample["tokenized_utterance"],
            "tokenized_slots": sample["tokenized_slots"],
            # No special tokens, so ids stay aligned one-to-one with the labels.
            "encoded_utterance": tokenizer.encode_plus(
                sample["tokenized_utterance"], add_special_tokens=False
            ),
            "encoded_slots": [slots2id[label] for label in sample["tokenized_slots"]],
        }

    return [_encode_one(sample) for sample in tokenized_data]

def check_preprocessing(encoded_data):
    """Verify that every record has exactly one slot id per input id.

    Args:
        encoded_data: Iterable of dicts with ``encoded_utterance`` (holding
            ``input_ids``), ``encoded_slots``, ``tokenized_utterance`` and
            ``tokenized_slots``.

    Raises:
        ValueError: If at least one record has a different number of input
            ids and slot ids. Offending records are logged at DEBUG level and
            their count at ERROR level first.
    """
    mismatches = 0
    for sample in encoded_data:
        n_ids = len(sample["encoded_utterance"]["input_ids"])
        n_slots = len(sample["encoded_slots"])
        if n_ids == n_slots:
            continue

        mismatches += 1
        # Dump the offending record so it can be inspected.
        for field in (
            "tokenized_utterance",
            "tokenized_slots",
            "encoded_utterance",
            "encoded_slots",
        ):
            logging.debug(sample[field])
        logging.debug("\n\n")

    if mismatches == 0:
        return

    logging.error("There are %d errors in the preprocessing", mismatches)
    raise ValueError("There are errors in the preprocessing")

def preprocess_data(raw_data, tokenizer, slots2id):
    """
    Run the full pipeline on raw records: tokenize, encode, validate.

    Args:
    - raw_data: list of dictionaries with `utterance` and `slot` entries
    - tokenizer: BertTokenizer
    - slots2id: dictionary mapping slots to numerical ids

    Note: the correctness of slots2id is assumed.

    Returns a list in which each element is a dictionary with the keys:
    - raw_slots
    - raw_utterance
    - tokenized_utterance
    - tokenized_slots
    - encoded_utterance
    - encoded_slots

    Raises ValueError (from check_preprocessing) when a record ends up with a
    different number of input ids and slot ids.
    """

    # Tokenize first (every sub-token inherits the label of its word), then
    # map the tokens and labels to ids.
    encoded = encode_data(tokenize_data(raw_data, tokenizer), tokenizer, slots2id)

    # Fail early if ids and labels are not aligned.
    check_preprocessing(encoded)

    return encoded

# One shared tokenizer for all splits.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Pre-process the splits in the order train, test, dev.
processed_train, processed_test, processed_dev = (
    preprocess_data(split, tokenizer, slots2id)
    for split in (train_raw, test_raw, dev_raw)
)

In [24]:
def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

def collate_fn(data):
    """Pad a list of samples to a common length and stack them into tensors.

    Args:
        data: List of ``(input_ids, attention_mask, token_type_ids, slots)``
            tuples whose members are lists of ints.

    Returns:
        A dict of tensors placed on the device returned by ``get_device()``:
        ``slots_len`` (unpadded slot length per sample) plus ``input_ids``,
        ``attention_mask``, ``token_type_ids`` and ``slots``, each right-padded
        with 0 up to the longest ``input_ids`` list in the batch.
    """
    target = get_device()

    sentences, masks, type_ids, slot_seqs = zip(*data)

    # Every sequence is padded up to the longest input sequence of the batch.
    max_len = max(len(sentence) for sentence in sentences)

    def right_pad(sequence):
        return sequence + [0] * (max_len - len(sequence))

    columns = {
        "slots_len": [len(slot_seq) for slot_seq in slot_seqs],
        "input_ids": [right_pad(sentence) for sentence in sentences],
        # The mask is rebuilt from the sequence length: 1 for real positions,
        # 0 for padding.
        "attention_mask": [
            [1] * len(mask) + [0] * (max_len - len(mask)) for mask in masks
        ],
        "token_type_ids": [right_pad(ids) for ids in type_ids],
        "slots": [right_pad(slot_seq) for slot_seq in slot_seqs],
    }

    return {name: torch.tensor(rows).to(target) for name, rows in columns.items()}

In [25]:
import torch
class ATISDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-processed records.

    Each record must provide ``encoded_utterance`` (with ``input_ids``,
    ``attention_mask`` and ``token_type_ids``) and ``encoded_slots``.
    """

    def __init__ (self, processed_data):
        encodings = [record["encoded_utterance"] for record in processed_data]

        # One parallel list per field, indexed by sample position.
        self.input = [encoding["input_ids"] for encoding in encodings]
        self.attention_mask = [encoding["attention_mask"] for encoding in encodings]
        self.token_type_ids = [encoding["token_type_ids"] for encoding in encodings]
        self.slots = [record["encoded_slots"] for record in processed_data]

    def __len__(self):
        """Return the number of samples."""
        return len(self.input)
    
    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, token_type_ids, slots)`` for ``idx``."""
        columns = (self.input, self.attention_mask, self.token_type_ids, self.slots)
        return tuple(column[idx] for column in columns)
    
# Wrap every pre-processed split in a dataset (train, test, dev order).
train_dataset, test_dataset, dev_dataset = (
    ATISDataset(split)
    for split in (processed_train, processed_test, processed_dev)
)

# Evaluation loaders: fixed order, batches of 32, padded by collate_fn.
dev_dataloader = torch.utils.data.DataLoader(
    dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)

In [26]:
# Sanity check: print only the first collated dev batch, then stop iterating.
for batch in dev_dataloader:
    print(batch)
    break

{'slots_len': tensor([17, 14, 19,  8, 11,  3, 21, 24, 22, 30, 13, 23, 17, 12, 25,  6,  6, 19,
        19, 11, 20, 24, 13, 34,  6, 11, 15, 10, 23, 10, 13,  9],
       device='cuda:0'), 'input_ids': tensor([[1045, 2052, 5791,  ...,    0,    0,    0],
        [2296, 2309, 2028,  ...,    0,    0,    0],
        [2065, 2069, 3021,  ...,    0,    0,    0],
        ...,
        [1045, 2066, 2009,  ...,    0,    0,    0],
        [2009, 2003, 2092,  ...,    0,    0,    0],
        [1045, 2572, 2074,  ...,    0,    0,    0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
    