## Libraries

In [1]:
import json
import os
from pprint import pprint
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.utils.data as data

  from scipy.sparse import issparse


## Data 

In [2]:
def load_data(path):
    '''
        input: path/to/data
        output: json 
    '''
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

tmp_train_raw = load_data(os.path.join('../dataset','ATIS','train.json'))
test_raw = load_data(os.path.join('../dataset','ATIS','test.json'))
print('Train samples:', len(tmp_train_raw))
print('Test samples:', len(test_raw))
pprint(tmp_train_raw[0])

Train samples: 4978
Test samples: 893
{'intent': 'flight',
 'slots': 'O O O O O B-fromloc.city_name O B-depart_time.time '
          'I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O '
          'B-arrive_time.period_of_day',
 'utterance': 'i want to fly from boston at 838 am and arrive in denver at '
              '1110 in the morning'}


In [3]:
portion = 0.10

intents = [x['intent'] for x in tmp_train_raw] # We stratify on intents
count_y = Counter(intents)

labels = []
inputs = []
mini_train = []

for id_y, y in enumerate(intents):
    if count_y[y] > 1: # If some intents occurs only once, we put them in training
        inputs.append(tmp_train_raw[id_y])
        labels.append(y)
    else:
        mini_train.append(tmp_train_raw[id_y])
# Random Stratify
X_train, X_dev, y_train, y_dev = train_test_split(inputs, labels, test_size=portion, 
                                                    random_state=42, 
                                                    shuffle=True,
                                                    stratify=labels)
X_train.extend(mini_train)
train_raw = X_train
dev_raw = X_dev

y_test = [x['intent'] for x in test_raw]

# # Intent distributions
# print('Train:')
# pprint({k:round(v/len(y_train),3)*100 for k, v in sorted(Counter(y_train).items())})
# print('Dev:'), 
# pprint({k:round(v/len(y_dev),3)*100 for k, v in sorted(Counter(y_dev).items())})
# print('Test:') 
# pprint({k:round(v/len(y_test),3)*100 for k, v in sorted(Counter(y_test).items())})
# print('='*89)
# Dataset size
print('TRAIN size:', len(train_raw))
print('DEV size:', len(dev_raw))
print('TEST size:', len(test_raw))

TRAIN size: 4480
DEV size: 498
TEST size: 893


In [4]:
train_raw

[{'utterance': 'what is the cost for these flights from baltimore to philadelphia',
  'slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name',
  'intent': 'airfare'},
 {'utterance': 'flights from westchester county to san francisco daily',
  'slots': 'O O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name I-toloc.city_name B-flight_days',
  'intent': 'flight'},
 {'utterance': 'i would like a flight from philadelphia to dallas on american airlines',
  'slots': 'O O O O O O B-fromloc.city_name O B-toloc.city_name O B-airline_name I-airline_name',
  'intent': 'flight'},
 {'utterance': 'can i take a single airline from la to charlotte to newark back to la',
  'slots': 'O O O O O O O B-fromloc.city_name O B-toloc.city_name O B-toloc.city_name O O B-fromloc.city_name',
  'intent': 'airline'},
 {'utterance': 'what is the earliest flight from oakland to washington dc on sunday',
  'slots': 'O O O B-flight_mod O O B-fromloc.city_name O B-toloc.city_name B-toloc.state_code O 

In [5]:
PAD_TOKEN = 0

# Count the number of unique words, slot and intent tags
slot_dict = set()
intent_dict = set()

for example in train_raw:
    for slot in example['slots'].split():
        if slot not in slot_dict:
            slot_dict.add(slot)
    if example['intent'] not in intent_dict:
        intent_dict.add(example['intent'])
        
for example in dev_raw:
    for slot in example['slots'].split():
        if slot not in slot_dict:
            slot_dict.add(slot)
    if example['intent'] not in intent_dict:
        intent_dict.add(example['intent'])
        
for example in test_raw:
    for slot in example['slots'].split():
        if slot not in slot_dict:
            slot_dict.add(slot)
    if example['intent'] not in intent_dict:
        intent_dict.add(example['intent'])

print('# Slots:', len(slot_dict))
print('# Intent:', len(intent_dict))

# Slots: 129
# Intent: 26


In [6]:
slot_dict

{'B-aircraft_code',
 'B-airline_code',
 'B-airline_name',
 'B-airport_code',
 'B-airport_name',
 'B-arrive_date.date_relative',
 'B-arrive_date.day_name',
 'B-arrive_date.day_number',
 'B-arrive_date.month_name',
 'B-arrive_date.today_relative',
 'B-arrive_time.end_time',
 'B-arrive_time.period_mod',
 'B-arrive_time.period_of_day',
 'B-arrive_time.start_time',
 'B-arrive_time.time',
 'B-arrive_time.time_relative',
 'B-booking_class',
 'B-city_name',
 'B-class_type',
 'B-compartment',
 'B-connect',
 'B-cost_relative',
 'B-day_name',
 'B-day_number',
 'B-days_code',
 'B-depart_date.date_relative',
 'B-depart_date.day_name',
 'B-depart_date.day_number',
 'B-depart_date.month_name',
 'B-depart_date.today_relative',
 'B-depart_date.year',
 'B-depart_time.end_time',
 'B-depart_time.period_mod',
 'B-depart_time.period_of_day',
 'B-depart_time.start_time',
 'B-depart_time.time',
 'B-depart_time.time_relative',
 'B-economy',
 'B-fare_amount',
 'B-fare_basis_code',
 'B-flight',
 'B-flight_days',

## Data Loading

In [7]:
class IntentsAndSlots(data.Dataset):
    def __init__(self, dataset, lang):
        self.utterances = []

In [8]:
a = [t['utterance'] for t in train_raw]
# a = set(a)
# a
a

['what is the cost for these flights from baltimore to philadelphia',
 'flights from westchester county to san francisco daily',
 'i would like a flight from philadelphia to dallas on american airlines',
 'can i take a single airline from la to charlotte to newark back to la',
 'what is the earliest flight from oakland to washington dc on sunday',
 "i 'd like to book a flight from columbus to nashville please",
 'i would like a flight from denver to pittsburgh',
 'show me round trip tickets from new york to miami',
 'flights from phoenix to las vegas',
 "i 'd like to find the cheapest fare from atlanta to dallas",
 "what 's the latest flight i can get from dallas to boston",
 'what is the earliest arrival in salt lake city of a flight from toronto',
 'list the nonstop flights from miami to new york on a sunday along with the fares that are less than 466 dollars',
 'all round trip flights between new york and miami coach fare',
 'show me the flights from denver to philadelphia on a satu

In [9]:
from transformers import BertTokenizer

a = [
    "what is the cost for these flights from baltimore to philadelphia",
    "flights from westchester county to san francisco daily",
]
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encoding = tokenizer(a, padding=True, truncation=True)

In [10]:
train_encoding

{'input_ids': [[101, 2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407, 102], [101, 7599, 2013, 25489, 2221, 2000, 2624, 3799, 3679, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

# sad

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# this is a simple way to fix the issue of BERT breaking words into subwords
# so the issue of sub-tokenization
# as suggested here: https://stackoverflow.com/questions/62082938/how-to-stop-bert-from-breaking-apart-specific-words-into-word-piece
tokenizer.add_tokens(list(slot_dict))

# Example dataset
data = [
    {'utterance': 'what is the cost for these flights from baltimore to philadelphia',
     'slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name',
     'intent': 'airfare'},
    {'utterance': 'flights from westchester county to san francisco daily',
     'slots': 'O O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name I-toloc.city_name B-flight_days',
     'intent': 'flight'},
    {'utterance': 'i would like a flight from philadelphia to dallas on american airlines',
     'slots': 'O O O O O O B-fromloc.city_name O B-toloc.city_name O B-airline_name I-airline_name',
     'intent': 'flight'},
    {'utterance': 'can i take a single airline from la to charlotte to newark back to la',
     'slots': 'O O O O O O O B-fromloc.city_name O B-toloc.city_name O B-toloc.city_name O O B-fromloc.city_name',
     'intent': 'airline'}
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
        # self.intent_labels = {intent: i for i, intent in enumerate(set(item['intent'] for item in data))}
        # self.slot_labels = {slot: i for i, slot in enumerate(set(slot for item in data for slot in item['slots'].split()))}
        
    def __len__(self):
        return len(self.data)
    

    # TODO: modify the dataset so that it works correctly. With the added tokens the issues of sub-tokenization should be fixed, we'll see tomorrow.
    # https://huggingface.co/transformers/v3.4.0/custom_datasets.html
    def __getitem__(self, idx):
        item = self.data[idx]
        
        text = item['utterance']
        intent_label = item['intent']
        slot_labels = item['slots'].split()
        
        # Tokenize the text and align slot labels with tokens
        encoded_text = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=64)
        encoded_intent = self.tokenizer(intent_label, return_tensors='pt')
        encoded_slots = self.tokenizer(slot_labels, return_tensors='pt', truncation=True, padding='max_length', max_length=64)
        
        print("Encoded text: ", encoded_text)
        print("Encoded intent: ", encoded_intent)
        print("Encoded slots: ", encoded_slots)

        return {
            'input_ids': encoded_text['input_ids'].squeeze(),
            'attention_mask': encoded_text['attention_mask'].squeeze(),
            'intent_label': encoded_intent['input_ids'].squeeze(), 
            'slot_labels': encoded_slots['input_ids'].squeeze()
        }

# Create dataset and dataloader
dataset = CustomDataset(train_raw, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Example usage
for batch in dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    intent_label = batch['intent_label']
    slot_labels = batch['slot_labels']
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Intent Labels:", intent_label)
    print("Slot Labels:", slot_labels)

    print("Input tokens: ", tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0])))
    print("Intent labels untokenized: ", tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(intent_label)))
    # print("Slot labels untokenized: ", [tokenizer.convert_ids_to_tokens(slot) for slot in slot_labels])
    break


Encoded text:  {'input_ids': tensor([[  101,  2054,  7599,  1043,  1051, 10424,  1051,  1049,  4407,  1056,
          1051,  2624,  4557,  2278,  1051,  2007,  1037,  2358,  1051,  1052,
          1051,  2310,  2099,  1999,  5759,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0

RuntimeError: stack expects each tensor to be equal size, but got [13, 64] at entry 0 and [14, 64] at entry 1

In [None]:
asd = tokenizer.tokenize("B-fromloc.city_name")
print(asd)
# print(tokenizer.convert_ids_to_tokens(asd))
print(tokenizer.convert_tokens_to_ids(asd))
print(tokenizer.convert_tokens_to_string(asd))

['b-fromloc.city_name']
[30630]
b-fromloc.city_name


In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Add custom tokens to the tokenizer
tokenizer.add_tokens(list(slot_dict))

# Example dataset
data = [
    {'utterance': 'what is the cost for these flights from baltimore to philadelphia',
     'slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name',
     'intent': 'airfare'},
    {'utterance': 'flights from westchester county to san francisco daily',
     'slots': 'O O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name I-toloc.city_name B-flight_days',
     'intent': 'flight'},
    {'utterance': 'i would like a flight from philadelphia to dallas on american airlines',
     'slots': 'O O O O O O B-fromloc.city_name O B-toloc.city_name O B-airline_name I-airline_name',
     'intent': 'flight'},
    {'utterance': 'can i take a single airline from la to charlotte to newark back to la',
     'slots': 'O O O O O O O B-fromloc.city_name O B-toloc.city_name O B-toloc.city_name O O B-fromloc.city_name',
     'intent': 'airline'}
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

        # Create label maps dynamically
        self.intent_labels = {intent: i for i, intent in enumerate(set(item['intent'] for item in data))}
        self.slot_labels = {slot: i for i, slot in enumerate(set(slot for item in data for slot in item['slots'].split()))}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        text = item['utterance']
        intent_label = self.intent_labels[item['intent']]
        slot_labels = item['slots'].split()
        
        # Tokenize the text
        encoding = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=64, is_split_into_words=False)
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        # Align slot labels with tokens
        word_ids = encoding.word_ids()
        slot_ids = [self.slot_labels[label] for label in slot_labels]
        full_slot_ids = [self.slot_labels['O']] * len(input_ids)
        
        # Fill in the slot labels for each token
        for i, word_id in enumerate(word_ids):
            if word_id is not None and word_id < len(slot_ids):
                full_slot_ids[i] = slot_ids[word_id]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'intent_label': torch.tensor(intent_label, dtype=torch.long),
            'slot_labels': torch.tensor(full_slot_ids, dtype=torch.long)
        }

# Create dataset and dataloader
dataset = CustomDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Example usage
for batch in dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    intent_label = batch['intent_label']
    slot_labels = batch['slot_labels']
    
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Intent Labels:", intent_label)
    print("Slot Labels:", slot_labels)
    
    print("Input tokens: ", tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0].tolist())))
    break


ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).