In [1]:
#clone the dataset repo
!git clone https://github.com/clinc/oos-eval.git
%cd oos-eval

Cloning into 'oos-eval'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 64 (delta 33), reused 48 (delta 19), pack-reused 0[K
Unpacking objects: 100% (64/64), done.
/content/oos-eval


In [2]:
#import necessary packages

import json
from random import randint
import numpy as np
import nltk
import string
import re
import os
import pandas as pd
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout,BatchNormalization
from keras.callbacks import ModelCheckpoint
from random import seed

nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#read the json file
current_directory = os.getcwd()
file_directory = current_directory+'/data/data_full.json'

with open(file_directory) as f:
  data = json.load(f)

#classes in the json file
for key in data.keys():
  print(key)

oos_val
val
train
oos_test
test
oos_train


In [4]:
#seed random number generator
#seed(1)

random_20_classes = []
i = 0
# generate 20 random integer values
while i!=20:
  value = randint(0, 149)
  if value not in random_20_classes:
    random_20_classes.append(value)
    i = i+1

In [5]:
#get the data for train, test and val
train_data = {'text':[],'intent':[]}
test_data = {'text':[],'intent':[]}
val_data = {'text':[],'intent':[]}

for ele in random_20_classes:
  for i in range(ele*100,ele*100+100):
    train_data['text'].append(data['train'][i][0])
    train_data['intent'].append(data['train'][i][1])
  for i in range(ele*30,ele*30+30):
    test_data['text'].append(data['test'][i][0])
    test_data['intent'].append(data['test'][i][1])
  for i in range(ele*20,ele*20+20):
    val_data['text'].append(data['val'][i][0])
    val_data['intent'].append(data['val'][i][1])

In [6]:
#load dataset as X and Y values 
def load_dataset(load_data):
  load_data = pd.DataFrame(load_data)
  X = load_data['text']
  Y = load_data['intent']
  unique_intent = list(set(Y))
  return (X,Y,unique_intent)

#text cleaning step - remove punctuation, tokenize and lemmatize
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = "".join([i.lower() for i in s if i not in string.punctuation])
    w = word_tokenize(clean)
    #lemmatizing
    words.append([lemmatizer.lemmatize(i) for i in w])
  return words

#create encodings
def create_tokenizer(words, oov,filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  if oov:
    token = Tokenizer(filters = filters,oov_token='OOV')
  else:
    token = Tokenizer(filters=filters)
  token.fit_on_texts(words)
  return token

#getting maximum length
def max_length_fn(words):
  return(len(max(words, key = len)))

#encoding list of words
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

#pad the sequence
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length,padding = "post"))

#one hot encoding
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [7]:
# call load dataset to create dataset
X_train,Y_train,unique_intent = load_dataset(train_data)
X_test,Y_test,_ = load_dataset(test_data)
X_val,Y_val,_ = load_dataset(val_data)

In [8]:
#20 distinct intents
print(unique_intent)

['transactions', 'order', 'date', 'insurance', 'lost_luggage', 'change_volume', 'yes', 'how_busy', 'direct_deposit', 'next_song', 'shopping_list', 'insurance_change', 'rewards_balance', 'time', 'what_are_your_hobbies', 'how_old_are_you', 'book_hotel', 'are_you_a_bot', 'next_holiday', 'order_status']


In [9]:
#clean train and val
Xtrain_clean = cleaning(X_train)
Xval_clean = cleaning(X_val)

In [10]:
#tokenizer
word_tokenizer = create_tokenizer(Xtrain_clean,oov=True)
#vocab_size
vocab_size = len(word_tokenizer.word_index) + 1
#max_length calculation
max_length = max_length_fn(Xtrain_clean)
print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 1222 and Maximum length = 28


In [11]:
#encode train and val sentences
encoded_input_train = encoding_doc(word_tokenizer, Xtrain_clean)
encoded_val = encoding_doc(word_tokenizer,Xval_clean)
#pad train and val sentences
padded_train = padding_doc(encoded_input_train, max_length)
padded_val = padding_doc(encoded_val,max_length)

In [12]:
#create encodings for output intent class
output_tokenizer = create_tokenizer(unique_intent, oov = False,filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
print(output_tokenizer.word_index)

{'transactions': 1, 'order': 2, 'date': 3, 'insurance': 4, 'lost_luggage': 5, 'change_volume': 6, 'yes': 7, 'how_busy': 8, 'direct_deposit': 9, 'next_song': 10, 'shopping_list': 11, 'insurance_change': 12, 'rewards_balance': 13, 'time': 14, 'what_are_your_hobbies': 15, 'how_old_are_you': 16, 'book_hotel': 17, 'are_you_a_bot': 18, 'next_holiday': 19, 'order_status': 20}


In [13]:
#encode output labels for train and val
encoded_output_train = encoding_doc(output_tokenizer, Y_train)
encoded_output_val = encoding_doc(output_tokenizer,Y_val)

#reshape vector
encoded_output_train = np.array(encoded_output_train).reshape(len(encoded_output_train), 1)
encoded_output_val = np.array(encoded_output_val).reshape(len(encoded_output_val), 1)

#one hot encode Y for train and val
output_onehot_train = one_hot(encoded_output_train)
output_onehot_val = one_hot(encoded_output_val)

In [14]:
#create model
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length))
  model.add(Bidirectional(LSTM(128)))
  model.add(Dense(64, activation = "relu"))
  model.add(Dropout(0.4))
  model.add(Dense(20, activation = "softmax"))
  return model

model = create_model(vocab_size, max_length)
#optimizer, metrics, and loss
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()
#checkpoint for model
checkpoint = ModelCheckpoint('intent_classification_model.h5', verbose=0, save_best_only=True, mode='auto',save_freq='epoch')
#shuffle data
train_X, train_Y = shuffle(padded_train,output_onehot_train)
val_X,val_Y = shuffle(padded_val,output_onehot_val)

#model training
model.fit(train_X, train_Y, epochs = 10, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           156416    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
Total params: 437,332
Trainable params: 437,332
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f61c0353470>

In [15]:
#load best model
model = load_model("intent_classification_model.h5")
 
#calculate predictions 
def predictions(text):
  test_word = cleaning(text)
  test_ls = encoding_doc(word_tokenizer,test_word)
  x = padding_doc(test_ls, max_length)
  pred = model.predict(x)
  return pred

#calculate classes
def get_final_output(pred, classes):
  predictions = pred[0]
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
  return classes[0]

#calculate test accuracy
count = 0
for i in range(len(X_test)):
  pred = predictions([X_test[i]])
  cls = get_final_output(pred, unique_intent)
  if cls==Y_test[i]:
    count=count+1
print('test accuracy ',count/len(Y_test))

test accuracy  0.9333333333333333


**Describe the process involved with deciding on the input feature representation, the model architecture, the training-related parameters, the performance evaluation metric(s), etc. Also, discuss challenges, simplifications made, and future work.**

**Ans:**

1. The dataset is read and using random number generator, random numbers are generated to choose 20 distinct in scope intent classes.

2. The respective train, val, and test data is retrieved for 20 different in scope intent classes.

3. This is a supervised learning task with text as data and intent as labels.

4. There are 100 data in train, 20 in validation and 30 in test for each in scope intent classes.

5. The data is split into X and Y based on text and labels for each class.

6. Then the text data is cleaned:

  6.1 The punctuations are removed first and converted to lower case.

  6.2 The remaining words are the tokenized.

  6.3 The tokenized words are then lemmatized.


  The reason for going for lemmatizing than stemming is, since being an intent classification class the meaning of words are really important. When these are stemmed then there is a high probability for the meaning of the word to change. In this task stopwords are not removed, being a small dataset if the stop words are removed the model will not be able to learn more relationships in texts, hence the stopwords are not removed.

7. The cleaned data is then tokenized, that is a dictionary or hash map of numbers is created which is used for encoding the text. This also determines the vocab_size.

   The encodings are generated only for data in the training set and not including the validation or test set. The main purpose of using validation or test set is to see how model performs on the data that it has not seen during the training. Since in most applications the data is split and checked but in this case the split data is given on hand and hence the val and test data is used for evaluation purpose alone.

8. Then the sentence texts are encoded and out of vocabulary words are not omitted.

9. Then the encodings are padded with zero at the end to match the maximum length of sequence of training data.

10. The intent classes are then tokenized and assigned encoding values. Then they are one hot encoded.

11. After the data for X and Y is prepared, the next step is model creation.

12. Being a time constraint assessment, I created a model with one Bidirectional LSTM layer, one dense layer and one final softmax layer with 20 units. Dropout layer is added to avoid overfitting of the model.

13. The model is trained and optimized with Adam optimizer, the loss is categorical crossentropy with the metrics 'accuracy'.

14. The model is trained for 10 epochs, with batch size 32 and only the best model is saved.

15. Then the saved best model is then used to estimate accuracy of the test set.


**Challenges:**
1. The main challenge in this task is to decide what kind of preprocessing steps needs to be employed.
2. Being a small dataset effective model design and hyperparamets are necessary.

**Simplification made**
1. Being a time constrained task, I spent a major time in cleaning the dataset and getting the data in the right format. Hence, I had to compromise on going with a simple model and not a deep or complex model.
2. Being a small model, CPU alone was neede to train the model.

**Future Work**
1. Include all the data for 150 classes and remove stopwords to minimize the vocab size.
2. Use attention models to learn embeddings and perform classification task which will give a better result once the stopwords are removed. 
3. Use deep model to learn more relationships among words.
