In [1]:
#clone the dataset repo
!git clone https://github.com/clinc/oos-eval.git
%cd oos-eval

Cloning into 'oos-eval'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 64 (delta 33), reused 48 (delta 19), pack-reused 0[K
Unpacking objects: 100% (64/64), done.
/content/oos-eval


In [2]:
#import necessary packages

import json
from random import randint
import numpy as np
import nltk
import string
import re
import pandas as pd
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout,BatchNormalization
from keras.callbacks import ModelCheckpoint
from random import seed
import os

nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
#class definition
class IntentClassifier:
  def __init__(self): #constructor
    self.data = {}
    self.random_20_classes = []
    self.train_data = {'text':[],'intent':[]}
    self.test_data = {'text':[],'intent':[]}
    self.val_data = {'text':[],'intent':[]}
    self.X_train = None
    self.X_test = None
    self.X_val = None 
    self.Y_train = None
    self.Y_test = None
    self.Y_val = None
    self.unique_intent = []
    self.lemmatizer = WordNetLemmatizer()
    self.word_tokenizer = Tokenizer()
    self.Xtrain_clean = []
    self.Xval_clean = []
    self.vocab_size = 0
    self.max_length = 0
    self.padded_train = None
    self.padded_val = None
    self.encoded_input_train = None
    self.encoded_val = None
    self.output_tokenizer = None
    self.encoded_output_train = None
    self.encoded_output_val = None
    self.output_onehot_train = None
    self.output_onehot_val = None
    self.model = None
    self.best_model_name = 'intent_classification_model.h5'
  
  #read the json file
  def read_file(self,filename):
    current_directory = os.getcwd()
    file_directory = current_directory+'/'+filename
    with open(file_directory) as f:
      self.data = json.load(f)
    print('keys :')
    #classes in the json file
    for key in self.data.keys():
      print(key)

  #load dataset as X and Y values 
  def load_dataset(self,load_data):
    load_data = pd.DataFrame(load_data)
    X = load_data['text']
    Y = load_data['intent']
    unique_intent = list(set(Y))
    return (X,Y,unique_intent) 
  
  #text cleaning step - remove punctuation, tokenize and lemmatize
  def cleaning(self,sentences):
    words = []
    for s in sentences:
      clean = "".join([i.lower() for i in s if i not in string.punctuation])
      w = word_tokenize(clean)
      #lemmatizing
      words.append([self.lemmatizer.lemmatize(i) for i in w])
    return words

  #create encodings
  def create_tokenizer(self,words,oov,filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    if oov:
      token = Tokenizer(filters = filters, oov_token='OOV')
    else:
      token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token
  
  #getting maximum length
  def max_length_func(self,words):
    return(len(max(words, key = len)))
  
  #encoding list of words
  def encoding_doc(self, token, words):
    return(token.texts_to_sequences(words))

  #pad the sequence  
  def padding_doc(self,encoded_doc):
    return(pad_sequences(encoded_doc, maxlen = self.max_length,padding = "post"))

  #one hot encoding 
  def one_hot(self,encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))
  
  #data preparation
  def data_preparation(self,filename):
    print('Data preparation begins')
    #seed random number generator
    seed(1)
    i = 0
    self.read_file(filename)

    # generate 20 random integer values
    while i!=20:
      value = randint(0, 149)
      if value not in self.random_20_classes:
        self.random_20_classes.append(value)
        i = i+1

    #get the data for train, test and val
    for ele in self.random_20_classes:
      for i in range(ele*100,ele*100+100):
        self.train_data['text'].append(self.data['train'][i][0])
        self.train_data['intent'].append(self.data['train'][i][1])
      for i in range(ele*30,ele*30+30):
        self.test_data['text'].append(self.data['test'][i][0])
        self.test_data['intent'].append(self.data['test'][i][1])
      for i in range(ele*20,ele*20+20):
        self.val_data['text'].append(self.data['val'][i][0])
        self.val_data['intent'].append(self.data['val'][i][1])
    
    # call load dataset to create dataset
    self.X_train,self.Y_train,self.unique_intent = self.load_dataset(self.train_data)
    self.X_test,self.Y_test,_ = self.load_dataset(self.test_data)
    self.X_val,self.Y_val,_ = self.load_dataset(self.val_data)

    #clean train and val
    self.Xtrain_clean = self.cleaning(self.X_train)
    self.Xval_clean  = self.cleaning(self.X_val)

    #tokenizer
    self.word_tokenizer = self.create_tokenizer(self.Xtrain_clean,oov=True)

    #vocab_size
    self.vocab_size = len(self.word_tokenizer.word_index)+1
    
    #max_length calculation
    self.max_length = self.max_length_func(self.Xtrain_clean)

    #encode train and val sentences
    self.encoded_input_train = self.encoding_doc(self.word_tokenizer,self.Xtrain_clean)
    self.encoded_val = self.encoding_doc(self.word_tokenizer,self.Xval_clean)

    #pad train and val sentences
    self.padded_train = self.padding_doc(self.encoded_input_train)
    self.padded_val = self.padding_doc(self.encoded_val)
    
    #create encodings for output intent class
    self.output_tokenizer = self.create_tokenizer(self.unique_intent, oov=False,filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')
    print(self.output_tokenizer.word_index)

    #encode output labels for train and val
    self.encoded_output_train = self.encoding_doc(self.output_tokenizer, self.Y_train)
    self.encoded_output_val = self.encoding_doc(self.output_tokenizer,self.Y_val)

    #reshape vector
    self.encoded_output_train = np.array(self.encoded_output_train).reshape(len(self.encoded_output_train), 1)
    self.encoded_output_val = np.array(self.encoded_output_val).reshape(len(self.encoded_output_val), 1)

    #one hot encode Y for train and val
    self.output_onehot_train = self.one_hot(self.encoded_output_train)
    self.output_onehot_val = self.one_hot(self.encoded_output_val)
  
  #create model
  def create_model(self):
    print('Model Created')
    model = Sequential()
    model.add(Embedding(self.vocab_size, 128, input_length = self.max_length))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.4))
    model.add(Dense(20, activation = "softmax"))
    return model

  def train_model(self,filename):
    self.data_preparation(filename)
    self.model = self.create_model()
    #optimizer, metrics, and loss
    self.model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    self.model.summary()
    #checkpoint for model
    checkpoint = ModelCheckpoint(self.best_model_name, verbose=0, save_best_only=True, mode='auto',save_freq='epoch')

    #shuffle data
    train_X, train_Y = shuffle(self.padded_train,self.output_onehot_train)
    val_X,val_Y = shuffle(self.padded_val,self.output_onehot_val)
    print('Training begins')

    #model training
    self.model.fit(train_X, train_Y, epochs = 10, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])
    
  #calculate predictions 
  def predictions(self,text):
    test_word = self.cleaning(text)
    test_enc = self.encoding_doc(self.word_tokenizer,test_word)
    x = self.padding_doc(test_enc)
    pred = self.model.predict(x)
    return pred
    
  #calculate classes
  def get_final_output(self,pred, classes):
    predictions = pred[0]
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)
    return classes[0]

  #calculate test accuracy
  def evaluate_test(self):
    #load best model
    self.model = load_model(self.best_model_name)
    count = 0
    for i in range(len(self.X_test)):
      pred = self.predictions([self.X_test[i]])
      cls = self.get_final_output(pred,self.unique_intent)
      if cls==self.Y_test[i]:
        count=count+1
    print('test accuracy', (count/len(self.Y_test)))
  
  #perform demo with one sentence
  def model_demo(self,text):
    #load best model
    self.model = load_model(self.best_model_name)
    pred = self.predictions([text])
    cls = self.get_final_output(pred,self.unique_intent)
    print('intent is :',cls)

In [4]:
#object creation
A = IntentClassifier()

#train model
A.train_model(filename = 'data/data_full.json')

Data preparation begins
keys :
oos_val
val
train
oos_test
test
oos_train
{'vaccines': 1, 'cook_time': 2, 'what_is_your_name': 3, 'account_blocked': 4, 'min_payment': 5, 'translate': 6, 'restaurant_reservation': 7, 'transactions': 8, 'what_are_your_hobbies': 9, 'flight_status': 10, 'no': 11, 'international_fees': 12, 'thank_you': 13, 'confirm_reservation': 14, 'calculator': 15, 'distance': 16, 'travel_alert': 17, 'pto_balance': 18, 'income': 19, 'shopping_list': 20}
Model Created
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 128)           188672    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
__________________________________

In [5]:
#demo with one sentence
A.model_demo('how much time does it take to cook rice?')

intent is : cook_time


In [6]:
#calculate test accuracy
A.evaluate_test()

test accuracy 0.9316666666666666
