In [1]:
import json
import logging
import os
from collections import Counter
from sklearn.model_selection import train_test_split


def load_data(path):
    dataset = []
    with open(path, "r") as file:
        dataset = json.loads(file.read())

    return dataset

def split_sets(tmp_train_raw):
    portion = 0.10

    intents = [x["intent"] for x in tmp_train_raw]  # We stratify on intents
    count_y = Counter(intents)

    labels = []
    inputs = []
    mini_train = []

    for id_y, y in enumerate(intents):
        if count_y[y] > 1:  # If some intents occurs only once, we put them in training
            inputs.append(tmp_train_raw[id_y])
            labels.append(y)
        else:
            mini_train.append(tmp_train_raw[id_y])

    # Random Stratify
    X_train, X_dev, _, _ = train_test_split(
        inputs,
        labels,
        test_size=portion,
        random_state=42,
        shuffle=True,
        stratify=labels,
    )
    X_train.extend(mini_train)
    train_raw = X_train
    dev_raw = X_dev

    return train_raw, dev_raw

def get_data(
    train=os.path.join("../dataset", "ATIS", "train.json"),
    test=os.path.join("../dataset", "ATIS", "test.json"),
):

    tmp_train_raw = load_data(train)
    test_raw = load_data(test)

    train_raw, dev_raw = split_sets(tmp_train_raw)

    logging.info("Train size: %d", len(train_raw))
    logging.info("Dev size: %d", len(dev_raw))
    logging.info("Test size: %d", len(test_raw))

    return train_raw, dev_raw, test_raw

train_raw, dev_raw, test_raw = get_data()

  from scipy.sparse import issparse


## Modify train_row, dev_raw and test_raw
So that it can be tokenized and put well.

In [2]:
slots_set = set()
intents_set = set()

for phrases in [train_raw, dev_raw, test_raw]:
    for phrase in phrases:
        for slot in phrase["slots"].split():
            slots_set.add(slot)
        intents_set.add(phrase["intent"])

slots2id = {"[PAD]": 0, "X": 1}
id2slots = {0: "[PAD]", 1: "X"}
for slot in (slots_set):
    slots2id[slot] = len(slots2id)
    id2slots[len(id2slots)] = slot

intent2id = {}
id2intent = {}
for intent in (intents_set):
    intent2id[intent] = len(intent2id)
    id2intent[len(id2intent)] = intent
    

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_preserve_labels(sentence, text_labels):
    text_labels = text_labels.split()
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] + ["X"] * (n_subwords - 1))

    return tokenized_sentence, labels

def tokenize_data(raw_data):
    processed_data = []
    for dset in raw_data:
        tokenized_set = {}
        tokenized_set["raw_intent"] = dset["intent"]
        tokenized_set["raw_slots"] = dset["slots"]
        tokenized_set["raw_utterance"] = dset["utterance"]

        tokenized_sentence, adapted_labels = tokenize_and_preserve_labels(
            dset["utterance"], dset["slots"]
        )

        tokenized_set["tokenized_utterance"] = tokenized_sentence
        tokenized_set["tokenized_slots"] = adapted_labels

        processed_data.append(tokenized_set)
    return processed_data

def encode_data(tokenized_data):
    encoded_data = []
    for dset in tokenized_data:
        encoded_set = {}
        
        encoded_set["raw_intent"] = dset["raw_intent"]
        encoded_set["raw_slots"] = dset["raw_slots"]
        encoded_set["raw_utterance"] = dset["raw_utterance"]
        encoded_set["tokenized_utterance"] = dset["tokenized_utterance"]
        encoded_set["tokenized_slots"] = dset["tokenized_slots"]

        # Encode the tokenized utterance
        encoded_set["encoded_utterance"] = tokenizer.encode_plus(dset["tokenized_utterance"], add_special_tokens=False)

        # Encode the tokenized slots
        encoded_set["encoded_slots"] = [slots2id[slot] for slot in dset["tokenized_slots"]]

        # Encode the intent
        encoded_set["encoded_intent"] = intent2id[dset["raw_intent"]]

        encoded_data.append(encoded_set)
    return encoded_data

def preprocess_data(raw_data):
    # Tokenize `utterance` and `slots` with sub-token labelling. The subtoken is labelled with `X`.
    # The `X` label is used to indicate that the subtoken is not the first subtoken of a word.
    processed_data = tokenize_data(raw_data)

    # Encode the tokenized data
    encoded_data = encode_data(processed_data)

    return encoded_data
     


processed_train = preprocess_data(train_raw) 


In [4]:
err = 0
for tokenized_data in processed_train:
    if ((len(tokenized_data["encoded_utterance"]["input_ids"]) - len(tokenized_data["encoded_slots"])) != 0):
        err += 1
        # print(tokenized_data["raw_utterance"])
        # print(tokenized_data["raw_slots"])
        print(tokenized_data["tokenized_utterance"])
        print(tokenized_data["tokenized_slots"])
        print(tokenized_data["encoded_utterance"])
        print(tokenized_data["encoded_slots"])
        print(tokenized_data["encoded_intent"])
        
        print("\n\n")
        break
    # print(len(tokenized_data["tokenized_slots"]))

print(err)


0


In [5]:
print(len(processed_train[0]["tokenized_utterance"]))
print(len(processed_train[0]["tokenized_slots"]))
print(len(processed_train[0]["encoded_utterance"]["input_ids"]))
print(len(processed_train[0]["encoded_slots"]))
print((processed_train[0]["encoded_intent"]))

11
11
11
11
9


In [6]:
processed_train[0]

{'raw_intent': 'airfare',
 'raw_slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name',
 'raw_utterance': 'what is the cost for these flights from baltimore to philadelphia',
 'tokenized_utterance': ['what',
  'is',
  'the',
  'cost',
  'for',
  'these',
  'flights',
  'from',
  'baltimore',
  'to',
  'philadelphia'],
 'tokenized_slots': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-fromloc.city_name',
  'O',
  'B-toloc.city_name'],
 'encoded_utterance': {'input_ids': [2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 'encoded_slots': [13, 13, 13, 13, 13, 13, 13, 13, 68, 13, 64],
 'encoded_intent': 9}

# TODO:
1. Create new torch.data.Dataset child with the new data
2. Create custom collate_fn for the `DataLoader`

Question: where to move data from to GPU? The easiest solution would be in the `__getitem__` of the `Dataset`, but this is not optimal (non batched operation, it probably requires gpu sync). If it's possible to do it inside the collate fn then it would be awesome.

In [7]:
from torch.utils.data import Dataset, DataLoader

class ATISDataset(Dataset):
    def __init__ (self, processed_data):
        self.input = [data["encoded_utterance"]["input_ids"] for data in processed_data]
        self.attention_mask = [data["encoded_utterance"]["attention_mask"] for data in processed_data]
        self.token_type_ids = [data["encoded_utterance"]["token_type_ids"] for data in processed_data]
        self.slots = [data["encoded_slots"] for data in processed_data]
        self.intent = [data["encoded_intent"] for data in processed_data]

    def __len__(self):
        return len(self.input)
    
    def __getitem__(self, idx):
        return self.input[idx], self.attention_mask[idx], self.token_type_ids[idx], self.slots[idx], self.intent[idx]
    
train_dataset = ATISDataset(processed_train)

In [8]:
train_dataset[0]

([2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [13, 13, 13, 13, 13, 13, 13, 13, 68, 13, 64],
 9)

Note to yourself:

if you add some padding, it's zeroes everywhere.

([2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407, 0, 0],  
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],  
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  
 [12, 12, 12, 12, 12, 12, 12, 12, 55, 12, 91, 0, 0],  
 14)

In [9]:
import torch

def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_rn(data):
    device = get_device()

    # Get the max length of the input sequence
    max_len = max([len(sentence) for sentence, _, _, _, _ in data])

    # PAD all the input sequences to the max length
    input_ids = torch.tensor([sentence + [0] * (max_len - len(sentence)) for sentence, _, _, _, _ in data]).to(device)
    attention_mask = torch.tensor([[1] * len(mask) + [0] * (max_len - len(mask)) for _, mask, _, _, _ in data]).to(device)
    token_type_ids = torch.tensor([token_type_ids + [0] * (max_len - len(token_type_ids)) for _, _, token_type_ids, _, _ in data]).to(device)
    slots = torch.tensor([slots + [0] * (max_len - len(slots)) for _, _, _, slots, _ in data]).to(device)
    intent = torch.tensor([intent for _, _, _, _, intent in data]).to(device)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "slots": slots,
        "intent": intent
    }

In [25]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_rn)
dev_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=collate_rn)
test_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=collate_rn)

In [26]:
for data in train_dataloader:
    print(data["input_ids"].shape)
    print(data["attention_mask"].shape)
    print(data["token_type_ids"].shape)
    print(data["slots"].shape)
    print(data["intent"].shape)
    
    break

torch.Size([32, 20])
torch.Size([32, 20])
torch.Size([32, 20])
torch.Size([32, 20])
torch.Size([32])


## Try to feed the model

In [27]:
from transformers import BertModel


class IntentSlotModel(torch.nn.Module):
    def __init__(self, slot_len, intent_len):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.intent_classifier = torch.nn.Linear(
            self.bert.config.hidden_size, intent_len
        )
        self.slot_classifier = torch.nn.Linear(self.bert.config.hidden_size, slot_len)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        return intent_logits, slot_logits
    
model = IntentSlotModel(len(slots2id), len(intent2id))
model.to(get_device())

IntentSlotModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [28]:
for batch in train_dataloader:
    # input_ids, attention_mask, token_type_ids, slots, intent = batch
    
    # print(input_ids)
    # print(attention_mask)
    # print(token_type_ids)
    # print(slots)
    # print(intent)
    
    # print(input_ids)
    # print(attention_mask)
    # print(token_type_ids)
    # break

    intent_logits, slot_logits = model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])
    
    print(intent_logits.shape)
    print(slot_logits.shape)

    print(torch.argmax(intent_logits, dim=1))
    print(torch.argmax(slot_logits, dim=2))

    break

torch.Size([32, 26])
torch.Size([32, 24, 131])
tensor([16, 12, 16, 16, 14, 16, 16, 16, 10, 16, 10, 16, 10, 16, 25, 16, 10, 12,
        10, 10, 10,  1, 12, 10, 16, 23, 16, 10, 25, 10, 23,  1],
       device='cuda:0')
tensor([[ 82,   3,  55,  21,  55,  55,  61,  55,  55,  92,  55,  55,   3,  54,
          55,  55, 115, 115, 115, 115, 115, 115, 115, 115],
        [ 20,  12,  94,  12,  94,  31,  31,  65,  31,  31,  31,   8,   8,   8,
           8,   8,   8,  20,   8,   8,   8,   8,   8,   8],
        [ 82,  55,  55,  31,  31, 120, 120, 120,  31,  35, 110,  82,  82,  82,
          81,  82,  81,  81,  81,  81,  81,  55,  81,  82],
        [ 78,   3,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  54,  55,
          55,  55,  55,   3,  55,  55,  55,  55,  55,  55],
        [110,  85,  85,  85,  31,  31,  31, 110, 110, 110, 110,  31,  31,  31,
          31,  31,  31, 110, 110, 110, 110, 110, 110, 110],
        [ 55, 129, 120,  31, 129,  12, 120,  29,  29,  65,  31,  65,  29,  29,
          

## Training loop

In [34]:
from sklearn.metrics import classification_report
from conll import evaluate

def calculate_loss(
    intent_loss_fn, slot_loss_fn, intent_logits, slot_logits, intent_labels, slot_labels
):
    intent_loss = intent_loss_fn(intent_logits, intent_labels)
    # intent_loss = 0
    slot_loss = slot_loss_fn(slot_logits.view(-1, len(slots2id)), slot_labels.view(-1))
    # slot_loss = 0
    return intent_loss + slot_loss

def eval_loop(
    model: IntentSlotModel,
    dataloader,
    intent_loss_fn,
    slot_loss_fn,
):

    model.eval()
    total_loss = []

    with torch.no_grad():
        for data in dataloader:
            # input_ids, attention_mask, intent_labels, slot_labels = data
            # intent_labels = intent_labels.squeeze(1)

            # Forward
            intent_logits, slot_logits = model(data["input_ids"], data["attention_mask"], data["token_type_ids"])
            total_loss.append(
                calculate_loss(
                    intent_loss_fn=intent_loss_fn,
                    intent_logits=intent_logits,
                    intent_labels=data["intent"],
                    slot_loss_fn=slot_loss_fn,
                    slot_logits=slot_logits,
                    slot_labels=data["slots"],
                )
            )

            intent_hyp = torch.argmax(intent_logits, dim=1)
            slot_hyp = torch.argmax(slot_logits, dim=2)

            print("========== EVAL ===========")
            # Intent: accuracy
            accuracy_intention = classification_report(
                    data["intent_labels"].to("cpu"),
                    intent_hyp.to("cpu"),
                    output_dict=True,
                    zero_division=False,
                )['accuracy']

            print(accuracy_intention)            

            # ref, hyp = [], []

            # input_ids = input_ids.to("cpu").tolist()
            # if slot_labels.shape != slot_hyp.shape and slot_labels.shape != input_ids.shape:
            #     print("Shape mismatch")
            #     print(slot_labels.shape)
            #     print(slot_hyp.shape)
            #     print(input_ids.shape)
            #     exit()

            # for input, s_ref, s_hyp in zip(input_ids, slot_labels, slot_hyp):
            #     tmp_ref = []
            #     tmp_hyp = []


            #     utterance = tokenizer.decode_utterance(input)
            #     # print(input)
            #     # print(utterance)
            #     # print(len(input))
            #     # print(len(utterance))
            #     # exit()

            #     for u, r,h in zip(utterance, s_ref, s_hyp):
            #         tmp_ref.append((u, f"{r}"))
            #         tmp_hyp.append((u, f"{h}"))

            #         print(f"Slot: {u} - Ref: {r} - Hyp: {h}")
            # break
        
        #         exit()
        #         ref.append(tmp_ref)
        #         hyp.append(tmp_hyp)
        
        # exit()

        # f1_slot = evaluate(ref, hyp)
        # print(f1_slot)

def train_loop(
    model: IntentSlotModel,
    data,
    optimizer,
    intent_loss_fn,
    slot_loss_fn,
):
    model.train()

    # input_ids, attention_mask, token_type_ids, slot_labels, intent_labels = data
    # intent_labels = intent_labels.squeeze(1)

    intent_logits, slot_logits = model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])

    loss = calculate_loss(
        intent_loss_fn=intent_loss_fn,
        intent_logits=intent_logits,
        intent_labels=batch["intent"],
        slot_loss_fn=slot_loss_fn,
        slot_logits=slot_logits,
        slot_labels=batch["slots"],
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
intent_loss_fn = torch.nn.CrossEntropyLoss()
slot_loss_fn = torch.nn.CrossEntropyLoss()

for index, batch in enumerate(train_dataloader):
    
    # input_ids, attention_mask, intent_labels, slot_labels = batch
    # intent_labels = intent_labels.squeeze(1)

    loss = train_loop(model, batch, optimizer, intent_loss_fn, slot_loss_fn)
    print(f"Step: {index} - Loss: {loss}")

    if (index % 50 == 0 and index != 0):
        # if (index % 5 == 0):
        eval_loop(model, dev_dataloader, intent_loss_fn, slot_loss_fn)

Step: 0 - Loss: 0.5163730382919312
Step: 1 - Loss: 0.3878338932991028
Step: 2 - Loss: 0.3836510181427002
Step: 3 - Loss: 0.5767920017242432
Step: 4 - Loss: 0.3523658215999603
Step: 5 - Loss: 0.16499820351600647
Step: 6 - Loss: 0.2956233322620392
Step: 7 - Loss: 0.3277897834777832
Step: 8 - Loss: 0.25844529271125793
Step: 9 - Loss: 0.33308225870132446
Step: 10 - Loss: 0.5483835339546204
Step: 11 - Loss: 0.6684406399726868
Step: 12 - Loss: 0.2237999439239502
Step: 13 - Loss: 0.42041563987731934
Step: 14 - Loss: 0.20923690497875214
Step: 15 - Loss: 0.763887882232666
Step: 16 - Loss: 0.30393359065055847
Step: 17 - Loss: 0.3117995858192444
Step: 18 - Loss: 0.5766531229019165
Step: 19 - Loss: 0.19068732857704163
Step: 20 - Loss: 0.3357091546058655
Step: 21 - Loss: 0.29853928089141846
Step: 22 - Loss: 0.7205965518951416
Step: 23 - Loss: 0.687757670879364
Step: 24 - Loss: 0.36139172315597534
Step: 25 - Loss: 0.3517206013202667
Step: 26 - Loss: 0.4545828104019165
Step: 27 - Loss: 0.580516457557

KeyError: 'intent_labels'