In [3]:
import json
import logging
import os
from collections import Counter
from sklearn.model_selection import train_test_split


def load_data(path):
    """Read a JSON file and return its parsed content.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized top-level JSON value stored in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (the default used by `open`).
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

def split_sets(tmp_train_raw, portion=0.10, random_state=42):
    """Split raw training examples into train and dev sets, stratified by intent.

    Examples whose intent occurs only once are kept out of the stratified
    split and appended to the training set afterwards.

    Args:
        tmp_train_raw: Sequence of examples; each must support `example["intent"]`.
        portion: Value forwarded as `test_size` to `train_test_split`
            (share of the stratifiable examples held out for dev).
        random_state: Seed forwarded to `train_test_split`.

    Returns:
        A `(train_raw, dev_raw)` tuple of example lists.
    """
    # We stratify on intents, so count how often each one occurs.
    count_y = Counter(example["intent"] for example in tmp_train_raw)

    inputs = []
    labels = []
    mini_train = []
    for example in tmp_train_raw:
        intent = example["intent"]
        if count_y[intent] > 1:
            inputs.append(example)
            labels.append(intent)
        else:
            # An intent that occurs only once goes straight to training.
            mini_train.append(example)

    # Random stratified split of the examples that can be stratified.
    X_train, X_dev, _, _ = train_test_split(
        inputs,
        labels,
        test_size=portion,
        random_state=random_state,
        shuffle=True,
        stratify=labels,
    )
    X_train.extend(mini_train)

    return X_train, X_dev

def get_data(
    train=os.path.join("../dataset", "ATIS", "train.json"),
    test=os.path.join("../dataset", "ATIS", "test.json"),
):
    """Load the train and test files and derive a dev set from the training data.

    Args:
        train: Path of the training JSON file.
        test: Path of the test JSON file.

    Returns:
        A `(train_raw, dev_raw, test_raw)` tuple.
    """
    full_train = load_data(train)
    held_out_test = load_data(test)

    train_part, dev_part = split_sets(full_train)

    # Report the size of each split through the root logger.
    for message, split in (
        ("Train size: %d", train_part),
        ("Dev size: %d", dev_part),
        ("Test size: %d", held_out_test),
    ):
        logging.info(message, len(split))

    return train_part, dev_part, held_out_test

# Load the three splits using get_data's default paths; later cells read these names.
train_raw, dev_raw, test_raw = get_data()

## Modify train_raw, dev_raw and test_raw
So that they can be tokenized and their labels encoded consistently.

In [39]:
# Collect every slot tag and every intent label that appears in any split.
slots_set = set()
intents_set = set()

for phrases in [train_raw, dev_raw, test_raw]:
    for phrase in phrases:
        slots_set.update(phrase["slots"].split())
        intents_set.add(phrase["intent"])

# Ids 0 and 1 are reserved for "[PAD]" and "X" ("X" is the label given to
# the non-initial sub-tokens of a word).
# Ids are assigned in sorted order: the iteration order of a set of strings
# changes between interpreter runs (hash randomization), which would make the
# label ids differ after every kernel restart.
slots2id = {"[PAD]": 0, "X": 1}
for slot in sorted(slots_set):
    # Never reassign a reserved entry, so both maps stay exact inverses.
    if slot not in slots2id:
        slots2id[slot] = len(slots2id)
id2slots = {index: slot for slot, index in slots2id.items()}

intent2id = {}
for intent in sorted(intents_set):
    intent2id[intent] = len(intent2id)
id2intent = {index: intent for intent, index in intent2id.items()}
    

In [34]:
from transformers import BertTokenizer

# Module-level tokenizer loaded from the 'bert-base-uncased' pretrained checkpoint;
# tokenize_and_preserve_labels and encode_data below read it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize a sentence word by word and expand its labels to sub-token level.

    The first sub-token of a word keeps the word's label; every following
    sub-token of the same word is labelled "X". Words for which the tokenizer
    produces no sub-token are dropped together with their label.

    Args:
        sentence: Whitespace-separated utterance.
        text_labels: Whitespace-separated labels, paired positionally with the
            words of `sentence`.

    Returns:
        A `(tokenized_sentence, labels)` tuple of two lists of equal length.
    """
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels.split()):
        tokenized_word = tokenizer.tokenize(word)
        if not tokenized_word:
            # No sub-token was produced: emitting the label anyway would leave
            # it without a token and shift every later label by one position.
            continue

        tokenized_sentence.extend(tokenized_word)
        labels.append(label)
        labels.extend(["X"] * (len(tokenized_word) - 1))

    return tokenized_sentence, labels

def tokenize_data(raw_data):
    """Tokenize every example and keep its raw fields next to the result.

    Args:
        raw_data: Iterable of examples exposing the keys "intent", "slots"
            and "utterance".

    Returns:
        A list with one dict per example, holding the keys "raw_intent",
        "raw_slots", "raw_utterance", "tokenized_utterance" and
        "tokenized_slots".
    """
    processed = []
    for example in raw_data:
        # Copy the untouched fields first, then add the sub-token view.
        record = {
            "raw_intent": example["intent"],
            "raw_slots": example["slots"],
            "raw_utterance": example["utterance"],
        }
        tokens, token_labels = tokenize_and_preserve_labels(
            example["utterance"], example["slots"]
        )
        record["tokenized_utterance"] = tokens
        record["tokenized_slots"] = token_labels
        processed.append(record)
    return processed

def encode_data(tokenized_data):
    """Add id-encoded utterance, slots and intent to every tokenized example.

    Args:
        tokenized_data: Iterable of dicts holding the keys "raw_intent",
            "raw_slots", "raw_utterance", "tokenized_utterance" and
            "tokenized_slots".

    Returns:
        A list of new dicts that carry the input keys over and add
        "encoded_utterance" (the result of `tokenizer.encode_plus` called with
        `add_special_tokens=False`), "encoded_slots" (ids looked up in
        `slots2id`) and "encoded_intent" (id looked up in `intent2id`).
    """
    carried_keys = (
        "raw_intent",
        "raw_slots",
        "raw_utterance",
        "tokenized_utterance",
        "tokenized_slots",
    )

    encoded_data = []
    for example in tokenized_data:
        record = {key: example[key] for key in carried_keys}

        # Utterance: encode the already tokenized sub-tokens.
        record["encoded_utterance"] = tokenizer.encode_plus(
            example["tokenized_utterance"], add_special_tokens=False
        )
        # Slots: one id per sub-token label.
        record["encoded_slots"] = [slots2id[tag] for tag in example["tokenized_slots"]]
        # Intent: a single id for the whole utterance.
        record["encoded_intent"] = intent2id[example["raw_intent"]]

        encoded_data.append(record)
    return encoded_data

def preprocess_data(raw_data):
    """Run the full preprocessing pipeline: tokenize, then encode.

    Tokenization labels sub-tokens: the label `X` marks a sub-token that is
    not the first sub-token of its word. The tokenized examples are then
    passed through `encode_data`.

    Args:
        raw_data: Raw examples accepted by `tokenize_data`.

    Returns:
        The list returned by `encode_data`.
    """
    return encode_data(tokenize_data(raw_data))
     


# Tokenize and encode the training split; the checks in the next cells inspect this list.
processed_train = preprocess_data(train_raw) 


In [38]:
# Sanity check: each example must have as many slot ids as token ids.
# The first offending example (if any) is printed and the scan stops there.
err = 0
for tokenized_data in processed_train:
    n_token_ids = len(tokenized_data["encoded_utterance"]["input_ids"])
    n_slot_ids = len(tokenized_data["encoded_slots"])
    if n_token_ids == n_slot_ids:
        continue

    err += 1
    print(tokenized_data["tokenized_utterance"])
    print(tokenized_data["tokenized_slots"])
    print(tokenized_data["encoded_utterance"])
    print(tokenized_data["encoded_slots"])
    print(tokenized_data["encoded_intent"])

    print("\n\n")
    break

print(err)


0


In [45]:
# Inspect the first training example: the four lengths should match.
first_example = processed_train[0]
print(len(first_example["tokenized_utterance"]))
print(len(first_example["tokenized_slots"]))
print(len(first_example["encoded_utterance"]["input_ids"]))
print(len(first_example["encoded_slots"]))
print(first_example["encoded_intent"])

11
11
11
11
20


# TODO:
1. Create a new `torch.utils.data.Dataset` subclass that wraps the preprocessed data
2. Create custom collate_fn for the `DataLoader`

Question: where should the data be moved to the GPU? The easiest solution would be in the `__getitem__` of the `Dataset`, but this is not optimal (it is a non-batched operation and probably requires a GPU sync). If it is possible to do it inside the `collate_fn`, that would be ideal.