In [1]:
import json
import os
from pprint import pprint
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.utils.data as data

  from scipy.sparse import issparse


In [2]:
def load_data(path):
    '''
        Read a JSON file from disk and return its parsed content.

        input: path/to/data (a JSON file)
        output: the decoded JSON value (whatever json.load produces for the file)

        Raises whatever open() / json.load() raise, e.g. FileNotFoundError
        for a missing file or json.JSONDecodeError for malformed content.
    '''
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# Load the raw ATIS splits; the dev split is carved out of the training data below.
tmp_train_raw = load_data(os.path.join('../dataset','ATIS','train.json'))
test_raw = load_data(os.path.join('../dataset','ATIS','test.json'))
print('Train samples:', len(tmp_train_raw))
print('Test samples:', len(test_raw))
# pprint(tmp_train_raw[0])

# Fraction of the stratifiable training examples that is held out as dev.
portion = 0.10

# The split is stratified on the intent label of each example.
intents = [sample['intent'] for sample in tmp_train_raw]
count_y = Counter(intents)

# Examples whose intent occurs only once cannot be stratified, so they are set
# aside (mini_train) and put back into the training split afterwards.
inputs, labels, mini_train = [], [], []
for sample, label in zip(tmp_train_raw, intents):
    if count_y[label] <= 1:
        mini_train.append(sample)
        continue
    inputs.append(sample)
    labels.append(label)

# Random stratified train/dev split with a fixed seed.
X_train, X_dev, y_train, y_dev = train_test_split(
    inputs,
    labels,
    test_size=portion,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

# Singleton-intent examples go to training only.
X_train.extend(mini_train)
train_raw, dev_raw = X_train, X_dev

y_test = [sample['intent'] for sample in test_raw]

# # Intent distributions
# print('Train:')
# pprint({k:round(v/len(y_train),3)*100 for k, v in sorted(Counter(y_train).items())})
# print('Dev:'), 
# pprint({k:round(v/len(y_dev),3)*100 for k, v in sorted(Counter(y_dev).items())})
# print('Test:') 
# pprint({k:round(v/len(y_test),3)*100 for k, v in sorted(Counter(y_test).items())})
# print('='*89)

# Dataset size
print('TRAIN size:', len(train_raw))
print('DEV size:', len(dev_raw))
print('TEST size:', len(test_raw))

Train samples: 4978
Test samples: 893
TRAIN size: 4480
DEV size: 498
TEST size: 893


In [3]:
PAD_TOKEN = 0

# Count the number of unique words, slot and intent tags.

# Words are collected from the training split only: the dev/test word loops
# were disabled in the original cell (presumably so that words unseen during
# training stay out-of-vocabulary at evaluation time -- confirm).
words_set = {
    word
    for example in train_raw
    for word in example['utterance'].split()
}

# Slot and intent tags are collected from all three splits. set.update/add
# already ignore duplicates, so no membership test is needed.
slot_set = set()
intent_set = set()
for split_raw in (train_raw, dev_raw, test_raw):
    for example in split_raw:
        slot_set.update(example['slots'].split())
        intent_set.add(example['intent'])

print('# Words:', len(words_set))
print('# Slots:', len(slot_set))
print('# Intent:', len(intent_set))

# Words: 864
# Slots: 129
# Intent: 26


In [47]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# Single checkpoint name so the tokenizer and the encoder are always loaded
# from the same pretrained weights.
BERT_CHECKPOINT = "bert-base-uncased"

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Disabled experiment: a simple way to work around BERT's sub-tokenization
# (words being broken into word pieces) is to add tokens to the tokenizer,
# as suggested here: https://stackoverflow.com/questions/62082938/how-to-stop-bert-from-breaking-apart-specific-words-into-word-piece
# tokenizer.add_tokens(list(slot_set))

# Example dataset: a few hand-picked samples in the same
# utterance / slots / intent format as the JSON data.
rdata = [
    {
        "utterance": "what is the cost for these flights from baltimore to philadelphia",
        "slots": "O O O O O O O O B-fromloc.city_name O B-toloc.city_name",
        "intent": "airfare",
    },
    {
        "utterance": "flights from westchester county to san francisco daily",
        "slots": "O O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name I-toloc.city_name B-flight_days",
        "intent": "flight",
    },
    {
        "utterance": "i would like a flight from philadelphia to dallas on american airlines",
        "slots": "O O O O O O B-fromloc.city_name O B-toloc.city_name O B-airline_name I-airline_name",
        "intent": "flight",
    },
    {
        "utterance": "can i take a single airline from la to charlotte to newark back to la",
        "slots": "O O O O O O O B-fromloc.city_name O B-toloc.city_name O B-toloc.city_name O O B-fromloc.city_name",
        "intent": "airline",
    },
]

utterance = [t["utterance"] for t in rdata]
slots = [t["slots"] for t in rdata]
intent = [t["intent"] for t in rdata]

# Every utterance is padded/truncated to 64 token ids and returned as PyTorch tensors.
tokenized_utterance = tokenizer(utterance, return_tensors='pt', truncation=True, padding='max_length', max_length=64)

model = BertModel.from_pretrained(BERT_CHECKPOINT)
model.eval()  # make inference mode explicit (dropout disabled)

# Only the extracted features are inspected here, so skip building the
# autograd graph: it saves memory and time and leaves the values unchanged.
with torch.no_grad():
    output = model(**tokenized_utterance)

# output.pooler_output # intent
# output.last_hidden_state # slot

# tokenizer.convert_ids_to_tokens(torch.argmax(output.last_hidden_state, dim=1).tolist()[0])


# print("Input tokens: ", tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0])))
# print("Intent labels untokenized: ", tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(intent_label)))

'[CLS]'

In [121]:
# Shape of the pooled output for the first utterance in the batch (a single vector).
output.pooler_output[0].shape

torch.Size([768])

In [122]:
# Shape of the per-token hidden states for the first utterance in the batch:
# one vector per position of the sequence padded to max_length=64.
output.last_hidden_state[0].shape

torch.Size([64, 768])

In [103]:
# torch.tensor(tokenizer(intent).input_ids[0])

In [62]:
# tokenizer.convert_ids_to_tokens([101, 1051, 1051, 1051, 1051, 1051, 1051, 1051, 1051, 1038, 1011, 2013, 4135, 2278, 1012, 2103, 1035, 2171, 1051, 1038, 1011, 2000, 4135, 2278, 1012, 2103, 1035, 2171, 102])

In [102]:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# # tokenizer.add_tokens(list(slot_set))

# tokenized_utterance = tokenizer(utterance, return_tensors='pt', truncation=True, padding='max_length', max_length=64)
# tokenizer.convert_ids_to_tokens(tokenized_utterance['input_ids'][0]) 