In [2]:
from transformers import BertModel, BertTokenizer

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
bert_model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [3]:
from sklearn.model_selection import train_test_split
import json
import os
from collections import Counter 

def load_data(path):
    '''
        input: path/to/data
        output: json 
    '''
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

tmp_train_raw = load_data(os.path.join('../dataset','ATIS','train.json'))
test_raw = load_data(os.path.join('../dataset','ATIS','test.json'))
print('Train samples:', len(tmp_train_raw))
print('Test samples:', len(test_raw))
# pprint(tmp_train_raw[0])

portion = 0.10

intents = [x['intent'] for x in tmp_train_raw] # We stratify on intents
count_y = Counter(intents)

labels = []
inputs = []
mini_train = []

for id_y, y in enumerate(intents):
    if count_y[y] > 1: # If some intents occurs only once, we put them in training
        inputs.append(tmp_train_raw[id_y])
        labels.append(y)
    else:
        mini_train.append(tmp_train_raw[id_y])
# Random Stratify
X_train, X_dev, y_train, y_dev = train_test_split(inputs, labels, test_size=portion, 
                                                    random_state=42, 
                                                    shuffle=True,
                                                    stratify=labels)
X_train.extend(mini_train)
train_raw = X_train
dev_raw = X_dev

y_test = [x['intent'] for x in test_raw]

print('TRAIN size:', len(train_raw))
print('DEV size:', len(dev_raw))
print('TEST size:', len(test_raw))

  from scipy.sparse import issparse


Train samples: 4978
Test samples: 893
TRAIN size: 4480
DEV size: 498
TEST size: 893


In [4]:
PAD_TOKEN = 0

# Count the number of unique words, slot and intent tags
words_set = set()
slot_set = set()
intent_set = set()

for example in train_raw:
    for word in example['utterance'].split():
        if word not in words_set:
            words_set.add(word)
    for slot in example['slots'].split():
        if slot not in slot_set:
            slot_set.add(slot)
    if example['intent'] not in intent_set:
        intent_set.add(example['intent'])
        
for example in dev_raw:
    # for word in example['utterance'].split():
    #     if word not in words_set:
    #         words_set.add(word)
    for slot in example['slots'].split():
        if slot not in slot_set:
            slot_set.add(slot)
    if example['intent'] not in intent_set:
        intent_set.add(example['intent'])
        
for example in test_raw:
    # for word in example['utterance'].split():
    #     if word not in words_set:
    #         words_set.add(word)
    for slot in example['slots'].split():
        if slot not in slot_set:
            slot_set.add(slot)
    if example['intent'] not in intent_set:
        intent_set.add(example['intent'])

num_words = len(words_set)
num_intent_labels = len(intent_set)
num_slot_labels = len(slot_set)

print('# Words:', num_words)
print('# Slots:', num_slot_labels)
print('# Intent:', num_intent_labels)

# Words: 864
# Slots: 129
# Intent: 26


In [5]:
intent_set

{'abbreviation',
 'aircraft',
 'aircraft+flight+flight_no',
 'airfare',
 'airfare+flight',
 'airfare+flight_time',
 'airline',
 'airline+flight_no',
 'airport',
 'capacity',
 'cheapest',
 'city',
 'day_name',
 'distance',
 'flight',
 'flight+airfare',
 'flight+airline',
 'flight_no',
 'flight_no+airline',
 'flight_time',
 'ground_fare',
 'ground_service',
 'ground_service+ground_fare',
 'meal',
 'quantity',
 'restriction'}

In [6]:
import torch
import torch.nn as nn

class IntentSlotModel(nn.Module):
    def __init__(self, bert_model, num_intent_labels, num_slot_labels):
        super(IntentSlotModel, self).__init__()
        self.bert = bert_model
        self.intent_classifier = nn.Linear(bert_model.config.hidden_size, num_intent_labels)
        self.slot_classifier = nn.Linear(bert_model.config.hidden_size, num_slot_labels)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output
        
        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)
        
        return intent_logits, slot_logits

# Example: Assuming we have 10 intents and 20 slot labels
model = IntentSlotModel(bert_model, num_intent_labels, num_slot_labels)


In [11]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW

slot_labels_list = list(slot_set)
intent_labels_list = list(intent_set)

slot_label_map = {label: idx for idx, label in enumerate(slot_labels_list)}
intent_label_map = {label: idx for idx, label in enumerate(intent_labels_list)}

# Extract data from raw dataset
inputs = [t["utterance"] for t in train_raw]
intent_labels = [intent_label_map[t["intent"]] for t in train_raw]
slot_labels = [
    [slot_label_map[label] for label in t["slots"].split()] for t in train_raw
]

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
bert_model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize inputs
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]
attention_mask = encoded_inputs["attention_mask"]

# Pad slot labels to match the length of input_ids
max_len = input_ids.shape[1]
padded_slot_labels = []
for label in slot_labels:
    padded_label = label + [slot_label_map["O"]] * (
        max_len - len(label)
    )  # Padding with "O"
    padded_slot_labels.append(padded_label)
slot_labels = torch.tensor(padded_slot_labels)

# Convert intent labels to tensors
intent_labels = torch.tensor(intent_labels)

# Define custom model
class IntentSlotModel(nn.Module):
    def __init__(self, bert_model, num_intent_labels, num_slot_labels):
        super(IntentSlotModel, self).__init__()
        self.bert = bert_model
        self.intent_classifier = nn.Linear(
            bert_model.config.hidden_size, num_intent_labels
        )
        self.slot_classifier = nn.Linear(bert_model.config.hidden_size, num_slot_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        return intent_logits, slot_logits


# Initialize model
num_intent_labels = len(intent_labels_list)
num_slot_labels = len(slot_labels_list)
model = IntentSlotModel(bert_model, num_intent_labels, num_slot_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
intent_labels = intent_labels.to(device)
slot_labels = slot_labels.to(device)




# Create a DataLoader
dataset = TensorDataset(input_ids, attention_mask, intent_labels, slot_labels)
dataloader = DataLoader(dataset, batch_size=64)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Loss functions
intent_loss_fn = nn.CrossEntropyLoss()
slot_loss_fn = nn.CrossEntropyLoss(ignore_index=slot_label_map["O"])

# Training loop
model.train()
for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, attention_mask, intent_labels, slot_labels = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        intent_labels = intent_labels.to(device)
        slot_labels = slot_labels.to(device)

        # Forward pass
        intent_logits, slot_logits = model(input_ids, attention_mask)

        # Compute losses
        intent_loss = intent_loss_fn(intent_logits, intent_labels)
        slot_loss = slot_loss_fn(
            slot_logits.view(-1, num_slot_labels), slot_labels.view(-1)
        )
        loss = intent_loss + slot_loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

# # Tokenize inputs
# encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
# input_ids = encoded_inputs['input_ids']
# attention_mask = encoded_inputs['attention_mask']

# # Pad slot labels to match the length of input_ids
# max_len = input_ids.shape[1]
# padded_slot_labels = []
# for label in slot_labels:
#     padded_label = label + [0] * (max_len - len(label))  # Padding with 0 (or any other pad token you use)
#     padded_slot_labels.append(padded_label)
# slot_labels = torch.tensor(padded_slot_labels)

# # Convert intent labels to tensors
# intent_labels = torch.tensor(intent_labels)

Epoch: 0, Loss: 7.967977523803711
Epoch: 0, Loss: 7.698600769042969
Epoch: 0, Loss: 7.174617767333984
Epoch: 0, Loss: 6.648371696472168
Epoch: 0, Loss: 6.421142101287842
Epoch: 0, Loss: 5.908531665802002
Epoch: 0, Loss: 5.897011756896973
Epoch: 0, Loss: 5.22761344909668
Epoch: 0, Loss: 5.132645606994629
Epoch: 0, Loss: 5.0595245361328125
Epoch: 0, Loss: 4.609044551849365
Epoch: 0, Loss: 4.30339241027832
Epoch: 0, Loss: 4.374334812164307
Epoch: 0, Loss: 4.241650581359863
Epoch: 0, Loss: 4.273484230041504
Epoch: 0, Loss: 3.696341037750244
Epoch: 0, Loss: 3.663994312286377
Epoch: 0, Loss: 3.6092495918273926
Epoch: 0, Loss: 3.8726887702941895
Epoch: 0, Loss: 3.46073055267334
Epoch: 0, Loss: 3.3968920707702637
Epoch: 0, Loss: 3.118438482284546
Epoch: 0, Loss: 3.639385223388672
Epoch: 0, Loss: 3.339080810546875
Epoch: 0, Loss: 3.5038058757781982
Epoch: 0, Loss: 3.288001537322998
Epoch: 0, Loss: 3.2868874073028564
Epoch: 0, Loss: 3.103123903274536
Epoch: 0, Loss: 3.159627914428711
Epoch: 0, L

KeyboardInterrupt: 

In [13]:
from pprint import pprint
for data in dataloader:
    pprint(data)
    break

[tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 7599, 2013,  ...,    0,    0,    0],
        [ 101, 1045, 2052,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2342,  ...,    0,    0,    0],
        [ 101, 7599, 2013,  ...,    0,    0,    0],
        [ 101, 3531, 2862,  ...,    0,    0,    0]], device='cuda:0'),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 tensor([ 7, 22, 22, 14, 22, 22, 22,  7, 22,  7, 22, 22, 22, 22, 22, 22, 22, 22,
        22, 22, 14,  7, 22,  4, 12,  7,  5, 22, 22,  5,  7, 22, 22, 22, 22,  7,
        22, 15, 22, 22, 22, 22,  7, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
        22, 22, 22, 22, 22, 22, 22, 22, 22, 14], device='cuda:0'),
 tensor([[38, 38, 38,  ..., 38, 38, 38],
        [38, 38, 95,  ..., 38, 38, 38],
        [38, 38, 38,  ..., 38

In [6]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW

# Create a DataLoader
dataset = TensorDataset(input_ids, attention_mask, intent_labels, slot_labels)
dataloader = DataLoader(dataset, batch_size=2)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Loss functions
intent_loss_fn = nn.CrossEntropyLoss()
slot_loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore pad token in slot loss

# Training loop
model.train()
for epoch in range(3):  # Example: 3 epochs
    for batch in dataloader:
        input_ids, attention_mask, intent_labels, slot_labels = batch
        
        # Forward pass
        intent_logits, slot_logits = model(input_ids, attention_mask)
        
        # Compute losses
        intent_loss = intent_loss_fn(intent_logits, intent_labels)
        slot_loss = slot_loss_fn(slot_logits.view(-1, num_slot_labels), slot_labels.view(-1))
        loss = intent_loss + slot_loss
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f"Epoch: {epoch}, Loss: {loss.item()}")




Epoch: 0, Loss: 5.883545875549316
Epoch: 1, Loss: 4.518344879150391
Epoch: 2, Loss: 3.9565749168395996
