In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:0

In [2]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim import AdamW
import os, json, pprint
import urllib.request
from sklearn.preprocessing import LabelEncoder
import torch


In [3]:


# training dataset
# store the response of URL and load the json file
url = 'https://raw.githubusercontent.com/DanielNorth/NLU-2023-Labs/main/labs/dataset/ATIS/train.json'
response = urllib.request.urlopen(url)
tmp_train_raw = json.load(response)

# load all the sentences, intent labels and slot labels as seperate list
sentences_train = [i['utterance'] for i in tmp_train_raw]
intent_labels_train = [i['intent'] for i in tmp_train_raw]
slot_labels_train = [i['slots'] for i in tmp_train_raw]

#slot_labels_train = [ ' '.join(word for word in sentence.split() if word != 'O') for sentence in slot_labels_train_paded]


# number of unique value for intent labels and slot labels
intent_labels_train_set_length = len(set(intent_labels_train))
slot_labels_train_set_length = len(set(slot_labels_train))

In [None]:
slot_labels_train[2]

'B-flight_time I-flight_time B-fromloc.city_name I-fromloc.city_name B-depart_time.time I-depart_time.time B-fromloc.city_name'

# New Section

In [4]:
def intents_label_encoding(intent_labels, all_intent_labels):
  label_encoder = LabelEncoder()
  label_encoder.fit(all_intent_labels)

  return label_encoder.transform(intent_labels)


def slot_labels_encoding(labels, dataset):
    # Create an instance of LabelEncoder
    encoder = LabelEncoder()

    slot_labels2 = [label for sentence in dataset for label in sentence.split()]
    encoder.fit(slot_labels2)
    encoded_slot2 = [encoder.transform(sentence.split()).tolist() for sentence in labels]

    encoded_slot = [[num + 1 for num in sublist] for sublist in encoded_slot2]

    return encoded_slot

In [5]:
def encode_data(sentences, encoded_intents, encoded_slot, tokenizer):
    input_ids = []
    attention_masks = []
    intent_labels_encoded = []
    slot_labels_encoded = []

    cat_slot = []

    for sentence, intent, criterion_slots in zip(sentences, encoded_intents, encoded_slot):
          # Encode sentence
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
          )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

        criterion_slots += [0] * (128 - len(criterion_slots))

        slot_labels_encoded.append(criterion_slots)

    slot_labels_encoded = torch.tensor(slot_labels_encoded)

    return {
        'input_ids': torch.cat(input_ids, dim=0),
        'attention_mask': torch.cat(attention_masks, dim=0),
        'intent_labels': torch.tensor(encoded_intents),
        'slot_labels': slot_labels_encoded
        }




In [6]:
encoded_intents_train = intents_label_encoding(intent_labels_train, intent_labels_train)
encoded_slots_train = slot_labels_encoding(slot_labels_train, slot_labels_train)

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_data = encode_data(sentences_train, encoded_intents_train, encoded_slots_train, tokenizer)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
print(encoded_data['input_ids'].size())
print(encoded_data['attention_mask'].size())
print(encoded_data['intent_labels'].size())
print(encoded_data['slot_labels'].size())

torch.Size([4978, 128])
torch.Size([4978, 128])
torch.Size([4978])
torch.Size([4978, 128])


In [8]:
class ModelIAS(nn.Module):
    def __init__(self, num_of_slot, num_of_intent, n_layer=1, pad_index=0):
        super(ModelIAS, self).__init__()
        # hid_size = Hidden size
        # out_slot = number of slots (output size for slot filling)
        # out_int = number of intents (ouput size for intent class)
        # emb_size = word embedding size
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        hid_size = self.bert.config.hidden_size
        self.hid_size = hid_size


        #self.utt_encoder = nn.LSTM(hid_size, hid_size, n_layer, bidirectional=True)
        self.slot_out = nn.Linear(hid_size*2, num_of_slot)

        self.utt_encoder = nn.LSTM(hid_size, hid_size, n_layer, bidirectional=False)
        self.slot_out = nn.Linear(hid_size, num_of_slot)

        self.intent_out = nn.Linear(hid_size, num_of_intent)
        # Dropout layer How do we apply it?
        self.dropout = nn.Dropout(0.1)


    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_output.last_hidden_state

        pooled_output = bert_output.pooler_output
        pooled_output = self.dropout(pooled_output)


        # Convert to dense tensor
        sequence_output = sequence_output.to_dense()

        # Apply LSTM
        lstm_output, _ = self.utt_encoder(sequence_output)


        # Concatenate the forward and backward hidden states
        lstm_output = torch.cat((lstm_output[:, :, :self.hid_size], lstm_output[:, :, self.hid_size:]), dim=2)
        #lstm_output = torch.cat((lstm_output[-2, :, :], lstm_output[-1,:, :]), dim=1)

        # Apply dropout
        lstm_output = self.dropout(lstm_output)

        # Compute slot logits
        slot_logits = self.slot_out(lstm_output)

        intent_logits = self.intent_out(pooled_output)

        return intent_logits, slot_logits

model = ModelIAS(slot_labels_train_set_length, intent_labels_train_set_length).to('cuda')

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
# model.to("cuda")

# Define hyperparameters
epochs = 10
lr = 1e-5
optimizer = AdamW(model.parameters(), lr=lr)
total_steps = len(slot_labels_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
batch_size = 64

In [10]:
dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], encoded_data['intent_labels'], encoded_data['slot_labels'])
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [11]:
intent_loss_fn = nn.CrossEntropyLoss()
slot_loss_fn = nn.CrossEntropyLoss(ignore_index=0)

In [13]:
dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], encoded_data['intent_labels'], encoded_data['slot_labels'])
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

from sklearn.model_selection import KFold

# Define the K-fold Cross Validator
kfold = KFold(n_splits=5, shuffle=True)

# K-fold Cross Validation model evaluation
for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):

    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=64, sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=64, sampler=test_subsampler)


    for epoch in range(epochs):
        total_loss = 0

        # Set the model to training mode
        model.train()

        # Loop over each batch from the training set
        for batch in trainloader:
            # Copy each tensor to the GPU
            input_ids = batch[0].to("cuda")
            attention_mask = batch[1].to("cuda")
            intent_labels = batch[2].to("cuda")
            slot_labels = batch[3].to("cuda")

            # Zero the gradients
            optimizer.zero_grad()

            # Perform a forward pass
            intent_logits, slot_logits = model(input_ids, attention_mask)

            # Compute the loss
            intent_loss = intent_loss_fn(intent_logits, intent_labels)
            slot_loss = slot_loss_fn(slot_logits.view(-1, slot_logits.shape[-1]), slot_labels.view(-1))


            # Compute the total loss
            total_loss = intent_loss + slot_loss

            # Perform a backward pass
            total_loss.backward()
            # Update the weights
            optimizer.step()
            # Add the loss to the total loss
            total_loss += total_loss.item()

        # Adjust the learning rate
        scheduler.step()
        total_val_loss = 0

        model.eval()
        for batch in testloader:
            # Copy each tensor to the GPU
            input_ids = batch[0].to("cuda")
            attention_mask = batch[1].to("cuda")
            intent_labels = batch[2].to("cuda")
            slot_labels = batch[3].to("cuda")

            with torch.no_grad():
                # Perform a forward pass
                intent_logits, slot_logits = model(input_ids, attention_mask)

                # Compute the loss
                intent_loss = intent_loss_fn(intent_logits, intent_labels)
                slot_loss = slot_loss_fn(slot_logits.view(-1, slot_logits.shape[-1]), slot_labels.view(-1))


                # Compute the total loss
                val_loss = intent_loss + slot_loss
                total_val_loss += val_loss

        avg_val_loss = total_val_loss / len(testloader)
        print(f"epoch: {epoch}, Training loss: {total_loss},average validation loss: {avg_val_loss} ")









FOLD 0
--------------------------------
epoch: 0, Training loss: 13.430728912353516,average validation loss: 6.001148700714111 


KeyboardInterrupt: ignored

In [None]:
# Training loop
for epoch in range(epochs):
    total_loss = 0

    # Set the model to training mode
    model.train()

    # Loop over each batch from the training set
    for batch in dataloader:
        # Copy each tensor to the GPU
        input_ids = batch[0].to("cuda")
        attention_mask = batch[1].to("cuda")
        intent_labels = batch[2].to("cuda")
        slot_labels = batch[3].to("cuda")

        # print(input_ids.size())
        # print(attention_mask.size())

        # slot_labels = slot_labels.to_dense()
        # intent_labels = intent_labels.to_dense()

        # Zero the gradients
        optimizer.zero_grad()

        # Perform a forward pass
        intent_logits, slot_logits = model(input_ids, attention_mask)

        # Compute the loss
        intent_loss = intent_loss_fn(intent_logits, intent_labels)
        slot_loss = slot_loss_fn(slot_logits.view(-1, slot_logits.shape[-1]), slot_labels.view(-1))


        # Compute the total loss
        total_loss = intent_loss + slot_loss

        # Perform a backward pass
        total_loss.backward()
        # Update the weights
        optimizer.step()

        # Add the loss to the total loss
        total_loss += total_loss.item()

    # Adjust the learning rate
    scheduler.step()
    print("Epoch: {}, Loss: {:.3f}".format(epoch+1, total_loss))

torch.save(model.state_dict(), "mymodel.pt")

Epoch: 1, Loss: 10.060
Epoch: 2, Loss: 7.041
Epoch: 3, Loss: 5.823
Epoch: 4, Loss: 3.854
Epoch: 5, Loss: 4.240
Epoch: 6, Loss: 3.498
Epoch: 7, Loss: 2.725
Epoch: 8, Loss: 2.827
Epoch: 9, Loss: 2.528


KeyboardInterrupt: ignored

In [None]:
url_test = 'https://raw.githubusercontent.com/DanielNorth/NLU-2023-Labs/main/labs/dataset/ATIS/train.json'
response_test = urllib.request.urlopen(url_test)
tmp_train_raw_test = json.load(response_test)

# load all the sentences, intent labels and slot labels as seperate list
sentences_test = [i['utterance'] for i in tmp_train_raw_test]
intent_labels_test = [i['intent'] for i in tmp_train_raw_test]
slot_labels_test = [i['slots'] for i in tmp_train_raw_test]

# number of unique value for intent labels and slot labels
intent_labels_test_set_length = len(set(intent_labels_test))
slot_labels_test_set_length = len(set(slot_labels_test))

# Instantiate the model
model = ModelIAS(slot_labels_test_set_length, intent_labels_test_set_length).to("cuda")

encoded_intents_test = intents_label_encoding(intent_labels_test, intent_labels_train)
encoded_slots_test = slot_labels_encoding(slot_labels_test, slot_labels_train)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
encoded_intents_test = intents_label_encoding(intent_labels_test, intent_labels_train)
encoded_slots_test = slot_labels_encoding(slot_labels_test, slot_labels_train)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_data = encode_data(sentences_test, encoded_intents_test, encoded_slots_test, tokenizer)

test_dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], encoded_data['intent_labels'], encoded_data['slot_labels'])
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=True)

In [None]:
device = 'cuda'

model = ModelIAS(slot_labels_train_set_length, intent_labels_train_set_length).to('cuda')
state_dict = torch.load('mymodel.pt')
#model.load_state_dict(torch.load('mymodel.pt'))
# model.load_state_dict(torch.load('bin/myModel.pt'))

# Evaluation
model.eval()

# Tracking variables
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0

intent_preds, slot_preds, true_intents, true_slots = [], [], [], []


# Evaluate data for one epoch
for batch in test_dataloader:

    # Unpack this training batch from our dataloader
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_intent_labels = batch[2].to(device)
    b_slot_labels = batch[3].to(device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training)
    with torch.no_grad():

        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        )

    intent_logits = outputs[0]
    slot_logits = outputs[1]

    # Accumulate the validation loss
    # Compute the loss
    intent_loss = intent_loss_fn(intent_logits, intent_labels)
    slot_loss = slot_loss_fn(slot_logits.view(-1, slot_logits.shape[-1]), slot_labels.view(-1))
    # Compute the total loss
    total_loss = intent_loss + slot_loss

    # Add the loss to the total loss
    total_loss += total_loss.item()


    # Move logits and labels to CPU
    intent_logits = intent_logits.detach().cpu().numpy()
    slot_logits = slot_logits.detach().cpu().numpy()

    intent_label_ids = intent_labels.to('cpu').numpy()
    slot_labels_ids = slot_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch
    #total_eval_accuracy += flat_accuracy(intent_logits, intent_label_ids)

    # Save predictions and true labels
    intent_preds.append(intent_logits)
    slot_preds.append(slot_logits)
    true_intents.append(intent_label_ids)
    true_slots.append(slot_labels_ids)

# Report the final accuracy for this validation run
# avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
# print("Accuracy: {0:.2f}".format(avg_val_accuracy))

# Calculate the average loss over all of the batches
avg_val_loss = total_eval_loss / len(test_dataloader)
print("Validation Loss: {0:.2f}".format(avg_val_loss))

# Concatenate all predictions and true values
intent_preds = np.concatenate(intent_preds, axis=0)
slot_preds = np.concatenate(slot_preds, axis=0)

true_intents = np.concatenate(true_intents, axis=0)
true_slots = np.concatenate(true_slots, axis=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: ignored