# Joint Intent Classification and Slot filling with BERT
This notebook is based on the paper __BERT for Joint Intent Classification and Slot Filling__ by Chen et al. (2019), https://arxiv.org/abs/1902.10909 but on a different dataset made for a class project.

Ideas were also taken from https://github.com/monologg/JointBERT, which is a PyTorch implementation of the paper with the original dataset.


## Install transformers

In [126]:
!pip install transformers
#rohit was here

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Download data

In [127]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [238]:
import pandas as pd

df = pd.read_json('/content/drive/MyDrive/op.json')

intent_map = dict(enumerate(df.intents.factorize()[1]))

print(df.shape)
df.head()

(19476, 3)


Unnamed: 0,code_mixed_sentence,code_mixed_slots,intents
0,washington dc se boston ke lie udaanon kee soo...,B-fromloc.city_name B-fromloc.state_code O B-t...,flight#ground_service
1,tower air par snacks diye jaate hain kya also ...,B-airline_name I-airline_name O B-meal_descrip...,meal#airfare#flight
2,baltimore se san francisco tak udaanen,B-fromloc.city_name O B-toloc.city_name I-tolo...,flight
3,kansas city se chicago ke lie kaun see udaanen...,B-fromloc.city_name I-fromloc.city_name O B-to...,flight#abbreviation
4,8 se 9 pm ke beech denver aane ya jaane vaalee...,B-depart_time.start_time O B-depart_time.end_t...,flight


In [239]:
intent_names = list(intent_map.values())
intent_names

['flight#ground_service',
 'meal#airfare#flight',
 'flight',
 'flight#abbreviation',
 'airfare#flight',
 'airline',
 'flight#airport',
 'ground_service',
 'abbreviation',
 'airline#flight',
 'flight#airfare',
 'abbreviation#flight',
 'flight#quantity#airfare',
 'flight+airfare',
 'flight#aircraft',
 'airline+flight_no#flight',
 'airfare',
 'airfare#abbreviation',
 'airfare#flight#ground_service',
 'flight#airline#ground_service',
 'ground_service#flight',
 'airfare#ground_service',
 'aircraft#ground_service#flight',
 'flight+airfare#flight',
 'airline#airport',
 'aircraft',
 'quantity#flight',
 'flight#airline',
 'flight#meal',
 'airfare#restriction#flight',
 'flight#capacity',
 'flight#quantity',
 'flight#aircraft+flight+flight_no',
 'flight_time',
 'flight_time#flight',
 'flight#flight_time',
 'distance#quantity',
 'aircraft#flight+airfare#flight',
 'airfare+flight#flight',
 'aircraft#flight',
 'airline#flight#airfare',
 'flight+airfare#abbreviation#flight',
 'airline#ground_service'

In [240]:
intents = []
for i in intent_names:
  lis= i.split("#")
  for i in lis:
    if not i in intents:
      intents.append(i)
intents

['flight',
 'ground_service',
 'meal',
 'airfare',
 'abbreviation',
 'airline',
 'airport',
 'quantity',
 'flight+airfare',
 'aircraft',
 'airline+flight_no',
 'restriction',
 'capacity',
 'aircraft+flight+flight_no',
 'flight_time',
 'distance',
 'airfare+flight',
 'ground_fare',
 'flight_no',
 'city',
 'ground_service+ground_fare',
 'day_name',
 'airfare+flight_time',
 'flight+airline',
 'cheapest',
 'flight_no+airline']

## Read data from json files

Data is of the following format
````json5
{
  "text": "",
  "positions": [{}],
  "slots": [{}],
  "intent": ""
}
````

We will be using `text` as the input and `slots` and `intent` as lables

In [196]:
df_train = df.iloc[:int(df.shape[0]*0.8)]
df_test = df.iloc[int(df.shape[0]*0.8):]

print(df_test.shape)
print(df_train.shape)

(3896, 3)
(15580, 3)


In [197]:
### UTILITY FUNCTIONS FOR TOKENIZATIONS, MASKS AND SEGMENTS CREATION ###
### from: https://www.kaggle.com/akensert/bert-base-tf2-0-now-huggingface-transformer

def set_seed(seed):
    
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    

def convert_to_transformer_inputs(str1, tokenizer, max_sequence_length, double=True):
    
    def return_id(str1, truncation_strategy, length):

        inputs = tokenizer.encode_plus(str1,
            add_special_tokens=True,
            max_length=length,
            truncation_strategy=truncation_strategy)
        
        input_ids =  inputs["input_ids"]
        input_masks = [1] * len(input_ids)
        input_segments = inputs["token_type_ids"]
        
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        return [input_ids, input_masks, input_segments]
    
    if double:
    
        input_ids_1, input_masks_1, input_segments_1 = return_id(
            str1, 'longest_first', max_sequence_length)

        return [input_ids_1, input_masks_1, input_segments_1]
    
    else:
        
        input_ids, input_masks, input_segments = return_id(
            str1, 'longest_first', max_sequence_length)

        return [input_ids, input_masks, input_segments,
                None, None, None]        

def compute_input_arrays(df, columns, tokenizer, max_sequence_length, double=True):
    
    input_ids_1, input_masks_1, input_segments_1 = [], [], []
    for _, instance in tqdm(df[columns].iterrows(), total=len(df)):
        str1 = instance[columns[0]]

        ids_1, masks_1, segments_1 = \
        convert_to_transformer_inputs(str1, tokenizer, max_sequence_length, double=double)
        
        input_ids_1.append(ids_1)
        input_masks_1.append(masks_1)
        input_segments_1.append(segments_1)

        
    if double:
        
        return [np.asarray(input_ids_1, dtype=np.int32), 
                np.asarray(input_masks_1, dtype=np.int32), 
                np.asarray(input_segments_1, dtype=np.int32)]
    
    else:
        
        return [np.asarray(input_ids_1, dtype=np.int32), 
                np.asarray(input_masks_1, dtype=np.int32), 
                np.asarray(input_segments_1, dtype=np.int32)]

## Load Tokenizer from transformers

We will use a pretrained bert model `bert-base-cased` for both Tokenizer and our classifier.

In [198]:
import tensorflow as tf
from transformers import AutoTokenizer

model_name_en = "bert-base-cased"
model_name_hi = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name_en)

# Encode texts from the dataset

We have to encode the texts using the tokenizer to create tensors for training the classifier.

In [199]:
# https://huggingface.co/transformers/preprocessing.html

def encode_texts(tokenizer, texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="tf")

texts = [d for d in df_train['code_mixed_sentence']]
tds = encode_texts(tokenizer, texts)
tds.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [200]:
encoded_texts = tds

## Encode labels
### Intents

### Slots

To padd all the texts to the same length, the tokenizer will use special characters. To handle those we need to add <PAD> to slots_names. It can be some other symbol as well.

In [201]:
df_train["intents"]
intents
index = [0]*len(intents)

full = []
for i in df_train["intents"]:
  lis = i.split("#")
  a = index.copy()
  for j in lis:
    ind = intents.index(j)
    a[ind] = 1
  full.append(a)
  a=index.copy

In [202]:
encoded_intents = tf.convert_to_tensor(full, dtype="int32")
encoded_intents

<tf.Tensor: shape=(15580, 26), dtype=int32, numpy=
array([[1, 1, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)>

In [203]:
slot_names = set()
for td in df.loc[:,"code_mixed_slots"]:
    for slot in td.split():
        slot_names.add(slot)
slot_names = list(slot_names)
slot_names.insert(0, "<PAD>")
slot_names.insert(1, "UNK")
slot_names

['<PAD>',
 'UNK',
 'B-state_code',
 '-arrive_time.time_relative',
 'B-meal_code',
 'OO',
 'I-depart_time.time',
 'I-depart_time.period_of_day',
 'I-time',
 'I-fromloc.city_name',
 'B-arrive_date.day_name',
 'I-fromloc.state_name',
 'I-airport_name',
 '-fromloc.airport_name',
 'B-cost_relative',
 'I-restriction_code',
 'I-toloc.state_name',
 'depart_time.period_of_day',
 'B-toloc.airport_code',
 'B-depart_time.start_time',
 'B-return_date.day_number',
 'B-arrive_time.end_time',
 'B-round_trip',
 'I-airline_name',
 'I-round_trip',
 'B-depart_date.month_name',
 'B-fromloc.airport_name',
 'B-depart_date.today_relative',
 'B-depart_time.time',
 'B-toloc.city_nam',
 'B-days_code',
 'B-today_relative',
 'B-return_date.day_name',
 'B-city_name',
 'I-arrive_time.end_time',
 'B-depart_time.end_time',
 'I-city_name',
 'I-arrive_time.time',
 'B-state_name',
 'B-day_name',
 'B-depart_date.year',
 'B-arrive_time.period_mod',
 'B-return_date.month_name',
 'B-flight',
 'I-transport_type',
 'B-return_t

In [204]:
slot_map = dict() # slot -> index
for idx, us in enumerate(slot_names):
    slot_map[us] = idx
slot_map

{'<PAD>': 0,
 'UNK': 1,
 'B-state_code': 2,
 '-arrive_time.time_relative': 3,
 'B-meal_code': 4,
 'OO': 5,
 'I-depart_time.time': 6,
 'I-depart_time.period_of_day': 7,
 'I-time': 8,
 'I-fromloc.city_name': 9,
 'B-arrive_date.day_name': 10,
 'I-fromloc.state_name': 11,
 'I-airport_name': 12,
 '-fromloc.airport_name': 13,
 'B-cost_relative': 14,
 'I-restriction_code': 15,
 'I-toloc.state_name': 16,
 'depart_time.period_of_day': 17,
 'B-toloc.airport_code': 18,
 'B-depart_time.start_time': 19,
 'B-return_date.day_number': 20,
 'B-arrive_time.end_time': 21,
 'B-round_trip': 22,
 'I-airline_name': 23,
 'I-round_trip': 24,
 'B-depart_date.month_name': 25,
 'B-fromloc.airport_name': 26,
 'B-depart_date.today_relative': 27,
 'B-depart_time.time': 28,
 'B-toloc.city_nam': 29,
 'B-days_code': 30,
 'B-today_relative': 31,
 'B-return_date.day_name': 32,
 'B-city_name': 33,
 'I-arrive_time.end_time': 34,
 'B-depart_time.end_time': 35,
 'I-city_name': 36,
 'I-arrive_time.time': 37,
 'B-state_name': 

In [205]:
def compute_slot_labels(df, column):
    all_sentences = []
    for slot in df[column]:
        
        slot_labels = []

        for s in slot.split():
            slot_labels.append(slot_map[s] if s in slot_map else slot_map["UNK"])
        all_sentences.append(slot_labels)
    return all_sentences

In [206]:
import numpy as np
encoded_slots = compute_slot_labels(df_train, "code_mixed_slots")

In [226]:
MAX_SEQUENCE_LENGTH = 164

In [227]:
def encoded_slot_array(df):
  encoded_slots_array = np.zeros(shape=(len(df['code_mixed_sentence']), MAX_SEQUENCE_LENGTH), dtype=np.int32)
  for i, slot in enumerate(encoded_slots):
    encoded_slots_array[i, 0: len(slot)] = slot
  return encoded_slots_array

In [228]:
encoded_slots_array = encoded_slot_array(df_train)

In [229]:
encoded_slots[0]
encoded_slots_array[0]

array([132,  77,  50,  90,  50,  50,  50,  50,  50,  50,  50,  33,  36,
        50,  50,  50,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

## Classifier Model

### Definition

In [230]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dropout, Dense, GlobalAveragePooling1D, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

class JointIntentAndSlotFillingModel(tf.keras.Model):

    def __init__(self, intent_num_labels=None, slot_num_labels=None,
                 model_name_en=model_name_en,model_name_hi=model_name_hi, dropout_prob=0.1):
        super().__init__(name="joint_intent_slot")
        self.bert_en = TFBertModel.from_pretrained(model_name_en)
        self.bert_hi = TFBertModel.from_pretrained(model_name_hi)
        self.dropout = Dropout(dropout_prob)
        self.intent_classifier = Dense(intent_num_labels,
                                       name="intent_classifier")
        self.slot_classifier = Dense(slot_num_labels,
                                     name="slot_classifier")

    def call(self, inputs, **kwargs):
        # two outputs from BERT
        trained_bert_en = self.bert_en(inputs, **kwargs)
        trained_bert_hi = self.bert_hi(inputs, **kwargs)

        pooled_output_en = trained_bert_en.pooler_output
        sequence_output_en = trained_bert_en.last_hidden_state

        pooled_output_hi = trained_bert_hi.pooler_output
        sequence_output_hi = trained_bert_hi.last_hidden_state
        
        # sequence_output will be used for slot_filling / classification
        sequence_output_en = self.dropout(sequence_output_en,
                                       training=kwargs.get("training", False))
        sequence_output_hi = self.dropout(sequence_output_hi,
                                       training=kwargs.get("training", False))
        sequence_output = Concatenate()([sequence_output_en, sequence_output_hi])
        slot_logits = self.slot_classifier(sequence_output)

        # pooled_output for intent classification
        pooled_output_en = self.dropout(pooled_output_en,
                                     training=kwargs.get("training", False))
        pooled_output_hi = self.dropout(pooled_output_hi,
                                     training=kwargs.get("training", False))
        pooled_output = Concatenate()([pooled_output_en, pooled_output_hi])
        intent_logits = self.intent_classifier(pooled_output)
        
        return slot_logits, intent_logits

In [231]:
from tensorflow.keras.losses  import BinaryCrossentropy

In [232]:
joint_model = JointIntentAndSlotFillingModel(
    intent_num_labels=len(intents), slot_num_labels=len(slot_map))

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you

### Hyperparams, Optimizer and Loss function

In [233]:
opt = Adam(learning_rate=3e-5, epsilon=1e-08)

# two outputs, one for slots, another for intents
# we have to fine tune for both
losses = [SparseCategoricalCrossentropy(from_logits=True),
          BinaryCrossentropy(from_logits=True)]

metrics = ["accuracy"]
# compile model
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

### Train

In [234]:
x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],  "attention_mask": encoded_texts["attention_mask"]}

In [235]:
history = joint_model.fit(
    x, (encoded_slots_array, encoded_intents), epochs=1, batch_size=16, shuffle=True, verbose = 2)

KeyboardInterrupt: ignored

## Inference

In [236]:
def nlu(text, tokenizer, model, intent_names, slot_names):
    inputs = tf.constant(tokenizer.encode(text))[None, :]  # batch_size = 1
    outputs = model(inputs)
    slot_logits, intent_logits = outputs
    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, :]
    #print(slot_ids)
    np_intents = np.array(intent_logits.numpy()[0])
    temp =[]
    for i in range(len(np_intents)):
      if(np_intents[i] > 0.75): #Threshold Value
        temp.append(intents[i])
    intents_string = "#".join(i for i in temp)
  
    info = {"intent": intents_string, "slots": {}}

    out_dict = {}
    # get all slot names and add to out_dict as keys
    predicted_slots = [slot_names[s] for s in slot_ids]
    info['slots'] = predicted_slots
    # for ps in predicted_slots:
    #   out_dict[ps] = []

    # # check if the text starts with a small letter
    # if text[0].islower():
    #   tokens = tokenizer.tokenize(text, add_special_tokens=True)
    # else:
    #   tokens = tokenizer.tokenize(text)
    # for token, slot_id in zip(tokens, slot_ids):
    #     # add all to out_dict
    #     slot_name = slot_names[slot_id]

    #     # if slot_name == "<PAD>":
    #     #     continue

    #     # collect tokens
    #     collected_tokens = [token]
    #     idx = tokens.index(token)

    #     # see if it starts with ##
    #     # then it belongs to the previous token
    #     if token.startswith("##"):
    #       # check if the token already exists or not
    #       if tokens[idx - 1] not in out_dict[slot_name]:
    #         collected_tokens.insert(0, tokens[idx - 1])

    #     # add collected tokens to slots
    #     out_dict[slot_name].extend(collected_tokens)

    # # process out_dict
    # for slot_name in out_dict:
    #     tokens = out_dict[slot_name]
    #     slot_value = tokenizer.convert_tokens_to_string(tokens)

    #     info["slots"][slot_name] = slot_value.strip()

    return info


In [242]:
nlu("cleveland to kansas city ke lie somavaar 3 pm se pahale pahunchen", tokenizer, joint_model, 
    intent_names, slot_names)

{'intent': 'airport#restriction#aircraft+flight+flight_no#ground_fare',
 'slots': ['B-stoploc.city_name',
  'B-return_date.day_number',
  'I-flight_stop',
  'B-fare_amount',
  'B-return_date.day_number',
  'B-toloc.country_name',
  'B-return_date.day_number',
  '-depart_time.time_relative',
  'B-flight_number',
  'I-class_type',
  'I-toloc.state_name',
  'B-depart_date.month_name',
  'I-toloc.state_name',
  'I-toloc.city_name',
  'B-stoploc.city_name',
  'I-city_nameTranslation',
  'I-city_nameTranslation',
  'B-depart_time.period_of_day',
  'B-fromloc.state_code',
  'I-toloc.state_name',
  'B-depart_date.today_relative',
  'B-depart_date.day_number',
  'B-meal_code',
  'I-toloc.state_name',
  'I-arrive_time.end_time',
  'B-depart_date.day_number',
  'B-fromloc.state_code']}

In [None]:
df_test_prediction = pd.DataFrame(list(df_test['code_mixed_sentence'].apply(lambda x:nlu(x, tokenizer, joint_model, 
    intent_names, slot_names))))

In [241]:
from sklearn.metrics import classification_report

print(classification_report( list(df_test['intents']), list(df_test_prediction['intent'])))

ValueError: ignored

In [187]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [188]:
from seqeval.metrics import precision_score, recall_score, f1_score

def get_slot_metrics(preds, labels):
    assert len(preds) == len(labels)
    
    return {
        "slot_precision": precision_score(labels, preds),
        "slot_recall": recall_score(labels, preds),
        "slot_f1": f1_score(labels, preds)
    }



In [192]:
res = []
for i in df_test_prediction['slots']:
  res.append(i)
# print(res, [i.split() for i in list(df_test['code_mixed_slots'])])
temp = len(res[0]) 
arr = []
for i in list(df_test['code_mixed_slots']):
  temp1 = i.split()
  if(temp - len(temp1) > 0):
    for j in range(temp - len(temp1)):
      temp1.append('<PAD>')
  arr.append(temp1)
#get_slot_metrics(res, [i.split() for i in list(df_test['code_mixed_slots'])])
get_slot_metrics(res, arr)



{'slot_precision': 0.0, 'slot_recall': 0.0, 'slot_f1': 0.0}