In [1]:
import json
import logging
import os
from collections import Counter
from sklearn.model_selection import train_test_split


def load_data(path):
    dataset = []
    with open(path, "r") as file:
        dataset = json.loads(file.read())

    return dataset

def split_sets(tmp_train_raw):
    portion = 0.10

    intents = [x["intent"] for x in tmp_train_raw]  # We stratify on intents
    count_y = Counter(intents)

    labels = []
    inputs = []
    mini_train = []

    for id_y, y in enumerate(intents):
        if count_y[y] > 1:  # If some intents occurs only once, we put them in training
            inputs.append(tmp_train_raw[id_y])
            labels.append(y)
        else:
            mini_train.append(tmp_train_raw[id_y])

    # Random Stratify
    X_train, X_dev, _, _ = train_test_split(
        inputs,
        labels,
        test_size=portion,
        random_state=42,
        shuffle=True,
        stratify=labels,
    )
    X_train.extend(mini_train)
    train_raw = X_train
    dev_raw = X_dev

    return train_raw, dev_raw

def get_data(
    train=os.path.join("../dataset", "ATIS", "train.json"),
    test=os.path.join("../dataset", "ATIS", "test.json"),
):

    tmp_train_raw = load_data(train)
    test_raw = load_data(test)

    train_raw, dev_raw = split_sets(tmp_train_raw)

    logging.info("Train size: %d", len(train_raw))
    logging.info("Dev size: %d", len(dev_raw))
    logging.info("Test size: %d", len(test_raw))

    return train_raw, dev_raw, test_raw

train_raw, dev_raw, test_raw = get_data()

  from scipy.sparse import issparse


## Modify train_row, dev_raw and test_raw
So that it can be tokenized and put well.

In [2]:
slots_set = set()
intents_set = set()

for phrases in [train_raw, dev_raw, test_raw]:
    for phrase in phrases:
        for slot in phrase["slots"].split():
            slots_set.add(slot)
        intents_set.add(phrase["intent"])

slots2id = {"pad": 0}
id2slots = {0: "O"}
for slot in (slots_set):
    slots2id[slot] = len(slots2id)
    id2slots[len(id2slots)] = slot

intent2id = {}
id2intent = {}
for intent in (intents_set):
    intent2id[intent] = len(intent2id)
    id2intent[len(id2intent)] = intent
    

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_and_preserve_labels(sentence, text_labels):
    text_labels = text_labels.split()
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        # labels.extend([label] + ["X"] * (n_subwords - 1))
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

def tokenize_data(raw_data):
    processed_data = []
    for dset in raw_data:
        tokenized_set = {}
        tokenized_set["raw_intent"] = dset["intent"]
        tokenized_set["raw_slots"] = dset["slots"]
        tokenized_set["raw_utterance"] = dset["utterance"]

        tokenized_sentence, adapted_labels = tokenize_and_preserve_labels(
            dset["utterance"], dset["slots"]
        )

        tokenized_set["tokenized_utterance"] = tokenized_sentence
        tokenized_set["tokenized_slots"] = adapted_labels

        processed_data.append(tokenized_set)
    return processed_data

def encode_data(tokenized_data):
    encoded_data = []
    for dset in tokenized_data:
        encoded_set = {}
        
        encoded_set["raw_intent"] = dset["raw_intent"]
        encoded_set["raw_slots"] = dset["raw_slots"]
        encoded_set["raw_utterance"] = dset["raw_utterance"]
        encoded_set["tokenized_utterance"] = dset["tokenized_utterance"]
        encoded_set["tokenized_slots"] = dset["tokenized_slots"]

        # Encode the tokenized utterance
        encoded_set["encoded_utterance"] = tokenizer.encode_plus(dset["tokenized_utterance"], add_special_tokens=False)

        # Encode the tokenized slots
        encoded_set["encoded_slots"] = [slots2id[slot] for slot in dset["tokenized_slots"]]

        # Encode the intent
        encoded_set["encoded_intent"] = intent2id[dset["raw_intent"]]

        encoded_data.append(encoded_set)
    return encoded_data

def preprocess_data(raw_data):
    # Tokenize `utterance` and `slots` with sub-token labelling. The subtoken is labelled with `X`.
    # The `X` label is used to indicate that the subtoken is not the first subtoken of a word.
    processed_data = tokenize_data(raw_data)

    # Encode the tokenized data
    encoded_data = encode_data(processed_data)

    return encoded_data

In [4]:
err = 0
for tokenized_data in processed_train:
    if ((len(tokenized_data["encoded_utterance"]["input_ids"]) - len(tokenized_data["encoded_slots"])) != 0):
        err += 1

        print(tokenized_data["tokenized_utterance"])
        print(tokenized_data["tokenized_slots"])
        print(tokenized_data["encoded_utterance"])
        print(tokenized_data["encoded_slots"])
        print(tokenized_data["encoded_intent"])
        
        print("\n\n")
        break

print(err)


0


In [5]:
print(len(processed_train[0]["tokenized_utterance"]))
print(len(processed_train[0]["tokenized_slots"]))
print(len(processed_train[0]["encoded_utterance"]["input_ids"]))
print(len(processed_train[0]["encoded_slots"]))
print((processed_train[0]["encoded_intent"]))

11
11
11
11
12


In [6]:
processed_train[0]

{'raw_intent': 'airfare',
 'raw_slots': 'O O O O O O O O B-fromloc.city_name O B-toloc.city_name',
 'raw_utterance': 'what is the cost for these flights from baltimore to philadelphia',
 'tokenized_utterance': ['what',
  'is',
  'the',
  'cost',
  'for',
  'these',
  'flights',
  'from',
  'baltimore',
  'to',
  'philadelphia'],
 'tokenized_slots': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-fromloc.city_name',
  'O',
  'B-toloc.city_name'],
 'encoded_utterance': {'input_ids': [2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 'encoded_slots': [12, 12, 12, 12, 12, 12, 12, 12, 102, 12, 14],
 'encoded_intent': 12}

# TODO:
1. Create new torch.data.Dataset child with the new data
2. Create custom collate_fn for the `DataLoader`

Question: where to move data from to GPU? The easiest solution would be in the `__getitem__` of the `Dataset`, but this is not optimal (non batched operation, it probably requires gpu sync). If it's possible to do it inside the collate fn then it would be awesome.

In [31]:
from torch.utils.data import Dataset, DataLoader

class ATISDataset(Dataset):
    def __init__ (self, processed_data):
        self.input = [data["encoded_utterance"]["input_ids"] for data in processed_data]
        self.attention_mask = [data["encoded_utterance"]["attention_mask"] for data in processed_data]
        self.token_type_ids = [data["encoded_utterance"]["token_type_ids"] for data in processed_data]
        self.slots = [data["encoded_slots"] for data in processed_data]
        self.intent = [data["encoded_intent"] for data in processed_data]

    def __len__(self):
        return len(self.input)
    
    def __getitem__(self, idx):
        return self.input[idx], self.attention_mask[idx], self.token_type_ids[idx], self.slots[idx], self.intent[idx]

In [8]:
train_dataset[0]

([2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [12, 12, 12, 12, 12, 12, 12, 12, 102, 12, 14],
 12)

Note to yourself:

if you add some padding, it's zeroes everywhere.

([2054, 2003, 1996, 3465, 2005, 2122, 7599, 2013, 6222, 2000, 4407, 0, 0],  
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],  
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  
 [12, 12, 12, 12, 12, 12, 12, 12, 55, 12, 91, 0, 0],  
 14)

In [9]:
import torch

def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_rn(data):
    device = get_device()

    # Get the max length of the input sequence
    max_len = max([len(sentence) for sentence, _, _, _, _ in data])

    # PAD all the input sequences to the max length
    slots_len = torch.tensor([len(slots) for _, _, _, slots, _ in data]).to(device)
    input_ids = torch.tensor([sentence + [0] * (max_len - len(sentence)) for sentence, _, _, _, _ in data]).to(device)
    attention_mask = torch.tensor([[1] * len(mask) + [0] * (max_len - len(mask)) for _, mask, _, _, _ in data]).to(device)
    token_type_ids = torch.tensor([token_type_ids + [0] * (max_len - len(token_type_ids)) for _, _, token_type_ids, _, _ in data]).to(device)
    slots = torch.tensor([slots + [0] * (max_len - len(slots)) for _, _, _, slots, _ in data]).to(device)
    intent = torch.tensor([intent for _, _, _, _, intent in data]).to(device)

    return {
        "slots_len": slots_len,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "slots": slots,
        "intent": intent
    }

In [10]:
train_dataset = ATISDataset(processed_train)
test_dataset = ATISDataset(processed_train)
dev_dataset = ATISDataset(processed_train)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_rn)
dev_dataloader = DataLoader(dev_dataset, batch_size=64, shuffle=False, collate_fn=collate_rn)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_rn)

In [11]:
for dataloader in train_dataloader:
    print(dataloader["input_ids"].shape)
    print(dataloader["attention_mask"].shape)
    print(dataloader["token_type_ids"].shape)
    print(dataloader["slots"].shape)
    print(dataloader["intent"].shape)
    
    break

torch.Size([64, 24])
torch.Size([64, 24])
torch.Size([64, 24])
torch.Size([64, 24])
torch.Size([64])


## Try to feed the model

In [12]:
from transformers import BertModel


class IntentSlotModel(torch.nn.Module):
    def __init__(self, slot_len, intent_len):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.intent_classifier = torch.nn.Linear(
            self.bert.config.hidden_size, intent_len
        )
        self.slot_classifier = torch.nn.Linear(self.bert.config.hidden_size, slot_len)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        return intent_logits, slot_logits

## Training loop

In [24]:
from sklearn.metrics import classification_report
from conll import evaluate
from transformers import get_linear_schedule_with_warmup

def calculate_loss(
    intent_loss_fn, slot_loss_fn, intent_logits, slot_logits, intent_labels, slot_labels
):
    intent_loss = intent_loss_fn(intent_logits, intent_labels)
    # intent_loss = 0
    slot_loss = slot_loss_fn(slot_logits.view(-1, len(slots2id)), slot_labels.view(-1))
    # slot_loss = 0
    return intent_loss + slot_loss

def eval_loop(
    model: IntentSlotModel,
    dataloader,
    intent_loss_fn,
    slot_loss_fn,
):

    model.eval()
    total_loss = []

    ref_intents = []
    hyp_intents = []

    ref_slots = []
    hyp_slots = []

    with torch.no_grad():
        for data in dataloader:
            # input_ids, attention_mask, intent_labels, slot_labels = data
            # intent_labels = intent_labels.squeeze(1)

            # Forward
            intent_logits, slot_logits = model(data["input_ids"], data["attention_mask"], data["token_type_ids"])
            total_loss.append(
                calculate_loss(
                    intent_loss_fn=intent_loss_fn,
                    intent_logits=intent_logits,
                    intent_labels=data["intent"],
                    slot_loss_fn=slot_loss_fn,
                    slot_logits=slot_logits,
                    slot_labels=data["slots"],
                )
            )

            intent_hyp = torch.argmax(intent_logits, dim=1)
            slot_hyp = torch.argmax(slot_logits, dim=2)

            # Intent inference
            ref_intents.extend(data["intent"].to("cpu").tolist())
            hyp_intents.extend(intent_hyp.to("cpu").tolist())
                    
            # Slot filling inference
            input_ids = data["input_ids"].to("cpu").tolist()
            if data["slots"].shape != slot_hyp.shape and data["slots"].shape != input_ids.shape:
                print("Shape mismatch")
                print(data["slots"].shape)
                print(slot_hyp.shape)
                print(input_ids.shape)
                exit()

            for input, s_ref, s_hyp, seq_length in zip(input_ids, data["slots"], slot_hyp, data["slots_len"]):
                tmp_ref = []
                tmp_hyp = []

                utterance = tokenizer.tokenize(tokenizer.decode(input, include_special_tokens=False))[:seq_length]

                for u, r,h in zip(utterance, s_ref, s_hyp):
                    tmp_ref.append((u, f"{id2slots[r.item()]}"))
                    tmp_hyp.append((u, f"{id2slots[h.item()]}"))

                ref_slots.append(tmp_ref)
                hyp_slots.append(tmp_hyp)
        
        f1_slot = evaluate(ref_slots, hyp_slots)

        accuracy_intention = classification_report(
                ref_intents,
                hyp_intents,
                output_dict=True,
                zero_division=False,
            )['accuracy']

        # print(accuracy_intention)
        # print(f1_slot["total"]["f"])
        return accuracy_intention, f1_slot["total"]["f"], total_loss


def train_loop(
    model: IntentSlotModel,
    data,
    optimizer,
    intent_loss_fn,
    slot_loss_fn,
    scheduler
):
    model.train()

    # input_ids, attention_mask, token_type_ids, slot_labels, intent_labels = data
    # intent_labels = intent_labels.squeeze(1)

    optimizer.zero_grad()
    intent_logits, slot_logits = model(data["input_ids"], data["attention_mask"], data["token_type_ids"])

    loss = calculate_loss(
        intent_loss_fn=intent_loss_fn,
        intent_logits=intent_logits,
        intent_labels=data["intent"],
        slot_loss_fn=slot_loss_fn,
        slot_logits=slot_logits,
        slot_labels=data["slots"],
    )

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()


    return loss.item()

model = IntentSlotModel(len(slots2id), len(intent2id))
model.to(get_device())

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
intent_loss_fn = torch.nn.CrossEntropyLoss()
slot_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=slots2id["pad"])

epochs = 6
total_steps = len(train_dataloader) * epochs
warmup_steps = int(0.1*total_steps)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * epochs)

In [15]:
from tqdm import tqdm

In [17]:
accuracy, f1, loss = 0, 0, [float("inf")]
bloss = 0

# Create the outer tqdm progress bar for epochs
epochs_tqdm = tqdm(range(epochs), desc=f"Epochs | Acc: {accuracy:.4f} - F1: {f1:.4f} - Loss: {sum(loss)/len(loss):.4f}")

for epoch in epochs_tqdm:
    # Create the inner tqdm progress bar for batches
    batch_tqdm = tqdm(enumerate(train_dataloader), desc=f"Batch | Loss: {bloss:.4f}", leave=False)

    for index, batch in batch_tqdm:
        bloss = train_loop(model, batch, optimizer, intent_loss_fn, slot_loss_fn, scheduler)
        batch_tqdm.set_description(f"Batch | Loss: {bloss:.4f}")
        
    accuracy, f1, loss = eval_loop(model, dev_dataloader, intent_loss_fn, slot_loss_fn)
    epochs_tqdm.set_description(f"Epochs | Acc: {accuracy:.4f} - F1: {f1:.4f} - Loss: {sum(loss)/len(loss):.4f}")

accuracy, f1, loss = eval_loop(model, test_dataloader, intent_loss_fn, slot_loss_fn)
print(f"Accuracy: {accuracy:.4f} - F1: {f1:.4f} - Loss: {sum(loss)/len(loss):.4f}")


Epochs | Acc: 0.9987 - F1: 0.9801 - Loss: 0.0432: 100%|██████████| 6/6 [01:53<00:00, 18.99s/it]


Accuracy: 0.9987 - F1: 0.9801 - Loss: 0.0432


In [30]:
for batch in train_dataloader:
    bloss = train_loop(model, batch, optimizer, intent_loss_fn, slot_loss_fn, scheduler)
        
accuracy, f1, loss = eval_loop(model, dev_dataloader, intent_loss_fn, slot_loss_fn)
print(f"Accuracy: {accuracy:.4f} - F1: {f1:.4f} - Loss: {sum(loss)/len(loss):.4f}")

Accuracy: 0.9929 - F1: 0.9295 - Loss: 0.1703


In [26]:
Accuracy: 0.9911 - F1: 0.9454 - Loss: 0.1569

SyntaxError: invalid syntax (3473973959.py, line 1)