In [1]:
from collections import Counter
import json
import os

from sklearn.model_selection import train_test_split

def load_data(path):
    """Read a JSON file and return its deserialized content.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The parsed top-level JSON value stored in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def split_sets(tmp_train_raw, portion=0.10, random_state=42):
    """Split the raw training samples into train and dev sets, stratified on intent.

    Samples whose intent occurs only once cannot be stratified, so they are
    kept out of the random split and appended to the training set afterwards.

    Args:
        tmp_train_raw: Sequence of samples; each sample is indexed with the
            key ``"intent"``.
        portion: Value passed to ``train_test_split`` as ``test_size``, i.e.
            the share of the stratifiable samples that goes to the dev set.
        random_state: Seed passed to ``train_test_split`` so that the split is
            reproducible.

    Returns:
        A ``(train_raw, dev_raw)`` tuple of lists of samples.
    """
    intent_counts = Counter(sample["intent"] for sample in tmp_train_raw)

    inputs = []      # samples whose intent occurs more than once
    labels = []      # intent of each entry of `inputs`, used for stratification
    mini_train = []  # samples with a unique intent: always assigned to training

    for sample in tmp_train_raw:
        intent = sample["intent"]
        if intent_counts[intent] > 1:
            inputs.append(sample)
            labels.append(intent)
        else:
            mini_train.append(sample)

    # Nothing to stratify (empty input, or every intent is unique): the split
    # call would fail on an empty collection, so everything stays in training.
    if not inputs:
        return mini_train, []

    # Random stratified split
    X_train, X_dev, _, _ = train_test_split(
        inputs,
        labels,
        test_size=portion,
        random_state=random_state,
        shuffle=True,
        stratify=labels,
    )
    X_train.extend(mini_train)

    return X_train, X_dev

def get_data(
    train=os.path.join("../dataset", "ATIS", "train.json"),
    test=os.path.join("../dataset", "ATIS", "test.json"),
):
    """Load the train and test JSON files and carve a dev set out of the train data.

    Args:
        train: Path of the training JSON file.
        test: Path of the test JSON file.

    Returns:
        A ``(train_raw, dev_raw, test_raw)`` tuple, where the first two come
        from ``split_sets`` applied to the loaded training file and the last
        is the loaded test file.
    """
    # Both files are read before splitting, train first.
    full_train, held_out_test = load_data(train), load_data(test)
    train_part, dev_part = split_sets(full_train)
    return train_part, dev_part, held_out_test

# Load the three splits using get_data's default train/test paths; these
# module-level names are used by the cells further down.
train_raw, dev_raw, test_raw = get_data()

  from scipy.sparse import issparse


In [3]:
from transformers import BertTokenizer
# Instantiate the tokenizer from the "bert-base-uncased" pretrained checkpoint;
# the cells below use it both to encode utterances and to decode ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [56]:
# Tokenize `phrase` with truncation enabled and padding up to max_length=64.
# NOTE(review): no assignment to `phrase` is visible above this cell (it is
# assigned only in cells further down), so this appears to rely on leftover
# kernel state and would fail on Restart & Run All - confirm and reorder.
tokenizer(phrase, truncation=True, padding="max_length", max_length=64)

{'input_ids': [[101, 2064, 1045, 2202, 1037, 2309, 8582, 2013, 2474, 2000, 5904, 2000, 12948, 2067, 2000, 2474, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2064, 1045, 2202, 1037, 2309, 8582, 2013, 2474, 2000, 5904, 2000, 12948, 2067, 2000, 2474, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,

In [47]:
# Build a compact id space for the BERT token ids that occur in the data:
#   bertId_to_seqId: BERT token id -> compact id
#   seqId2bertId:    compact id    -> BERT token id
# Id 0 maps to itself in both directions.
bertId_to_seqId = {0:0}
seqId2bertId = {0:0}

print(len(seqId2bertId))

for phrases in [train_raw, dev_raw, test_raw]:
    for phrase in phrases:
        ids = tokenizer(phrase["utterance"])["input_ids"]

        # The loop variable is deliberately not called `id`: at notebook top
        # level that would rebind the builtin `id` for every later cell.
        for bert_id in ids:
            if bert_id not in bertId_to_seqId:
                # Compute the new compact id once so both tables always agree.
                # NOTE(review): `len(...) + 1` makes the first new token get
                # id 2, so compact id 1 is never assigned - confirm whether it
                # is reserved on purpose before changing it.
                seq_id = len(bertId_to_seqId) + 1
                bertId_to_seqId[bert_id] = seq_id
                seqId2bertId[seq_id] = bert_id

1


In [60]:
# Encoding example: a batch made of the same training utterance twice is
# tokenized, then every BERT token id is translated to its compact id.
phrase = [train_raw[3]['utterance']] * 2
print(phrase)

tokens = tokenizer(phrase)['input_ids']
encoded_phrase = [
    list(map(bertId_to_seqId.__getitem__, bert_ids)) for bert_ids in tokens
]
encoded_phrase
    

['can i take a single airline from la to charlotte to newark back to la', 'can i take a single airline from la to charlotte to newark back to la']


[[2, 29, 20, 30, 23, 31, 32, 10, 33, 12, 34, 12, 35, 36, 12, 33, 14],
 [2, 29, 20, 30, 23, 31, 32, 10, 33, 12, 34, 12, 35, 36, 12, 33, 14]]

In [62]:
# Decoding example: translate compact ids back to BERT token ids, then let the
# tokenizer turn those into text.
# NOTE(review): the ids below are hard-coded; what they decode to depends on
# the order in which seqId2bertId was filled - confirm they match this mapping.
encoded_phrase = [2, 20, 116, 12, 198, 12, 17, 291, 10, 75, 356, 20, 184, 12, 189, 190, 70, 48, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

ids = [seqId2bertId[compact_id] for compact_id in encoded_phrase]
decoded = tokenizer.decode(ids)
decoded

'[CLS] i evening to which to san looking from of mia i flying to an stop arrival denver [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [59]:
# Display the current value of `ids` (the whole list is printed).
ids

[101,
 1045,
 3944,
 2000,
 2029,
 2000,
 2624,
 2559,
 2013,
 1997,
 8764,
 1045,
 3909,
 2000,
 2019,
 2644,
 5508,
 7573,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]