# Intent Classification 

## Clinc 150 Dataset from UCI
[An evaluation datset for intent classification and out-of-scope prediction.](https://archive.ics.uci.edu/ml/datasets/CLINC150)

**Citation**
* Stefan Larson, Anish Mahendran, Joseph J. Peper, Christopher Clarke, Andrew Lee, Parker Hill, Jonathan K. Kummerfeld, Kevin Leach, Michael A. Laurenzano, Lingjia Tang, and Jason Mars. 2019. An evaluation dataset for intent classification and out-of-scope prediction. In Proceedings of EMNLP-IJCNLP

#### import dependencies

In [1]:
import os
import json
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

In [2]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [3]:
tokenizer = RegexpTokenizer(r'\w+')

#### load data

In [4]:
path = '../data/clinc150_uci/data_full.json'

In [5]:
with open(path) as file:
    data = json.load(file)

In [6]:
data.keys()

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])

#### collect texts and labels

In [7]:
train_texts = []
train_labels = []
val_texts = []
val_labels = []
test_texts = []
test_labels = []

In [17]:
# collect texts and labels in train and oos_train set
for item in data['train']:
    train_texts.append(item[0])
    train_labels.append(item[-1])
# collect texts and labels in oos-train set
for item in data['oos_train']:
    train_texts.append(item[0])
    train_labels.append(item[-1])

In [18]:
# collect texts and labels in val and oos_val set
for item in data['val']:
    val_texts.append(item[0])
    val_labels.append(item[-1])
# collect texts and labels in oos-train set
for item in data['oos_val']:
    val_texts.append(item[0])
    val_labels.append(item[-1])

In [19]:
# collect texts and labels in test and oos_test set
for item in data['test']:
    test_texts.append(item[0])
    test_labels.append(item[-1])
# collect texts and labels in oos-train set
for item in data['oos_test']:
    test_texts.append(item[0])
    test_labels.append(item[-1])

In [20]:
# collect unique labels
label_set = list(set(train_labels + val_labels + test_labels))
label_set

['order_checks',
 'text',
 'recipe',
 'application_status',
 'routing',
 'card_declined',
 'update_playlist',
 'tell_joke',
 'change_language',
 'report_lost_card',
 'uber',
 'reminder_update',
 'do_you_have_pets',
 'timer',
 'translate',
 'credit_limit',
 'cancel',
 'change_volume',
 'order_status',
 'gas',
 'book_hotel',
 'taxes',
 'mpg',
 'current_location',
 'directions',
 'last_maintenance',
 'who_do_you_work_for',
 'change_ai_name',
 'schedule_meeting',
 'share_location',
 'confirm_reservation',
 'what_is_your_name',
 'how_old_are_you',
 'meaning_of_life',
 'calories',
 'whisper_mode',
 'restaurant_reservation',
 'pto_balance',
 'interest_rate',
 'smart_home',
 'flight_status',
 'reminder',
 'repeat',
 'travel_alert',
 'alarm',
 'w2',
 'definition',
 'pin_change',
 'ingredients_list',
 'cancel_reservation',
 'insurance',
 'apr',
 'rewards_balance',
 'time',
 'insurance_change',
 'international_visa',
 'shopping_list',
 'tire_change',
 'restaurant_suggestion',
 'transactions',
 'r

In [21]:
len(label_set)

151

#### Text Preprocessing

In [22]:
# encode categories

# initializer label encoder
le = LabelEncoder()
le.fit(label_set)

# encode labels
train_y = le.transform(train_labels)
val_y = le.transform(val_labels)
test_y = le.transform(test_labels)

##### Text Feature Extraction

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = tokenizer.tokenize, analyzer = 'word', stop_words = 'english').fit(train_texts + val_texts + test_texts)

In [25]:
train_X = tfidf_vectorizer.transform(train_texts)
val_X = tfidf_vectorizer.transform(val_texts)
test_X = tfidf_vectorizer.transform(test_texts)

In [26]:
print(train_X.shape)

(30100, 7022)


#### SVM

In [27]:
# initializer SVM classifier
svm = SVC(C = 2, class_weight = 'balanced')

In [28]:
# training SVM
svm.fit(train_X, train_y)

SVC(C=2, class_weight='balanced')

In [29]:
svm.score(train_X, train_y)

0.9821262458471761

In [30]:
svm.score(val_X, val_y)

0.8141935483870968

In [31]:
svm.score(test_X, test_y)

0.7987272727272727

#### AdaBoost Classifier

In [186]:
from sklearn.ensemble import GradientBoostingClassifier

In [189]:
# initializer Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(learning_rate = 0.01, n_iter_no_change = 10, verbose = 1)

In [190]:
# train gb_classifier
gb_classifier.fit(train_X, train_y)

      Iter       Train Loss   Remaining Time 
         1           3.9994           12.71m
         2           3.7270           13.15m
         3           3.5334           13.10m
         4           3.3805           12.90m
         5           3.2546           12.78m
         6           3.1468           12.69m
         7           3.0531           12.53m
         8           2.9653           12.36m
         9           2.8891           12.20m
        10           2.8169           12.04m
        20           2.3170           10.60m
        30           2.0055            9.24m
        40           1.7720            7.91m
        50           1.5912            6.60m
        60           1.4391            5.27m
        70           1.3104            3.95m
        80           1.1980            2.63m
        90           1.1021            1.31m
       100           1.0179            0.00s


GradientBoostingClassifier(learning_rate=0.01, n_iter_no_change=10, verbose=1)

In [191]:
gb_classifier.score(val_X, val_y)

0.7316129032258064

In [192]:
gb_classifier.score(test_X, test_y)

0.6163636363636363

### Save model and intent list

In [32]:
# save model
filename = '../intent_classifier.sav'
pickle.dump(svm, open(filename, 'wb'))

In [42]:
# append new_line character
labels = [label+ '\n' for label in label_set]
labels[-1] = labels[-1].strip('\n')

# save intent list
with open('../intent_list.txt', 'w') as file:
    file.writelines(labels)
    
# save vocabs
with open('../vocabs.pickle', 'wb') as file:
    vocabs = tfidf_vectorizer.vocabulary_
    pickle.dump(vocabs, file)
    
# save vectorizer
with open('../tfidf_vectorizer.pickle', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)