# Intent Classification 

## Clinc 150 Dataset from UCI
[An evaluation datset for intent classification and out-of-scope prediction.](https://archive.ics.uci.edu/ml/datasets/CLINC150)

**Citation**
* Stefan Larson, Anish Mahendran, Joseph J. Peper, Christopher Clarke, Andrew Lee, Parker Hill, Jonathan K. Kummerfeld, Kevin Leach, Michael A. Laurenzano, Lingjia Tang, and Jason Mars. 2019. An evaluation dataset for intent classification and out-of-scope prediction. In Proceedings of EMNLP-IJCNLP

### import dependencies

In [1]:
import os
import json
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

In [36]:
import nltk
from  nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

In [37]:
stop_words = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

### Load data

In [4]:
path = '../data/clinc150_uci/data_full.json'

In [5]:
with open(path) as file:
    data = json.load(file)

In [6]:
data.keys()

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])

#### Collect texts and labels

In [7]:
train_texts = []
train_labels = []
val_texts = []
val_labels = []
test_texts = []
test_labels = []

In [8]:
# collect texts and labels in train and oos_train set
for item in data['train']:
    train_texts.append(item[0])
    train_labels.append(item[-1])
# collect texts and labels in oos-train set
for item in data['oos_train']:
    train_texts.append(item[0])
    train_labels.append(item[-1])

In [9]:
# collect texts and labels in val and oos_val set
for item in data['val']:
    val_texts.append(item[0])
    val_labels.append(item[-1])
# collect texts and labels in oos-train set
for item in data['oos_val']:
    val_texts.append(item[0])
    val_labels.append(item[-1])

In [10]:
# collect texts and labels in test and oos_test set
for item in data['test']:
    test_texts.append(item[0])
    test_labels.append(item[-1])
# collect texts and labels in oos-train set
for item in data['oos_test']:
    test_texts.append(item[0])
    test_labels.append(item[-1])

In [11]:
# collect unique labels
label_set = list(set(train_labels + val_labels + test_labels))
label_set

['pto_request',
 'application_status',
 'gas',
 'direct_deposit',
 'insurance',
 'reminder',
 'make_call',
 'bill_due',
 'interest_rate',
 'sync_device',
 'report_fraud',
 'no',
 'spelling',
 'translate',
 'thank_you',
 'gas_type',
 'calculator',
 'time',
 'todo_list',
 'travel_notification',
 'insurance_change',
 'routing',
 'jump_start',
 'traffic',
 'credit_score',
 'share_location',
 'balance',
 'measurement_conversion',
 'travel_alert',
 'goodbye',
 'book_flight',
 'expiration_date',
 'pin_change',
 'income',
 'oil_change_when',
 'order_status',
 'change_volume',
 'taxes',
 'find_phone',
 'account_blocked',
 'tell_joke',
 'tire_pressure',
 'repeat',
 'report_lost_card',
 'timer',
 'rollover_401k',
 'credit_limit',
 'freeze_account',
 'definition',
 'schedule_meeting',
 'vaccines',
 'mpg',
 'pay_bill',
 'next_holiday',
 'fun_fact',
 'distance',
 'order_checks',
 'shopping_list_update',
 'restaurant_reviews',
 'international_fees',
 'pto_balance',
 'rewards_balance',
 'who_do_you_wo

In [12]:
len(label_set)

151

### Label Preprocessing

In [13]:
# encode categories

# initializer label encoder
le = LabelEncoder()
le.fit(label_set)

# encode labels
train_y = le.transform(train_labels)
val_y = le.transform(val_labels)
test_y = le.transform(test_labels)

### Approach 1: TF-IDF Feature Extraction

#### Term-Frequency & Inverse-Document-Frequency

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = tokenizer.tokenize, analyzer = 'word', stop_words = 'english').fit(train_texts + val_texts + test_texts)



In [59]:
len(tfidf_vectorizer.vocabulary_)

7022

In [60]:
train_X = tfidf_vectorizer.transform(train_texts)
val_X = tfidf_vectorizer.transform(val_texts)
test_X = tfidf_vectorizer.transform(test_texts)

In [61]:
print(train_X.shape)

(15100, 7022)


#### Training models

##### SVM

In [47]:
# initializer SVM classifier
svm = SVC(C = 2, class_weight = 'balanced', kernel = 'rbf', random_state = 97, gamma = 'scale')

In [48]:
# training SVM
svm.fit(train_X, train_y)

SVC(C=2, class_weight='balanced', random_state=97)

In [49]:
svm.score(train_X, train_y)

0.9794701986754967

In [50]:
svm.score(val_X, val_y)

0.817741935483871

In [51]:
svm.score(test_X, test_y)

0.8016363636363636

##### MLP

In [52]:
from sklearn.neural_network import MLPClassifier

In [62]:
classifier = MLPClassifier(hidden_layer_sizes = (256, 151), activation = 'relu', solver = 'sgd',
                           batch_size = 32, shuffle = True, random_state = 97, learning_rate = 'adaptive',
                           verbose = True, early_stopping = True)

In [63]:
classifier.fit(train_X, train_y)

Iteration 1, loss = 5.02309069
Validation score: 0.003311
Iteration 2, loss = 5.02050159
Validation score: 0.003311
Iteration 3, loss = 5.01809208
Validation score: 0.003311
Iteration 4, loss = 5.01568883
Validation score: 0.003311
Iteration 5, loss = 5.01323725
Validation score: 0.003311
Iteration 6, loss = 5.01067835
Validation score: 0.003311
Iteration 7, loss = 5.00798457
Validation score: 0.003974
Iteration 8, loss = 5.00516358
Validation score: 0.005960
Iteration 9, loss = 5.00214277
Validation score: 0.006623
Iteration 10, loss = 4.99894906
Validation score: 0.011258
Iteration 11, loss = 4.99557059
Validation score: 0.012583
Iteration 12, loss = 4.99199354
Validation score: 0.017219
Iteration 13, loss = 4.98822679
Validation score: 0.020530
Iteration 14, loss = 4.98425790
Validation score: 0.025828
Iteration 15, loss = 4.98006914
Validation score: 0.033113
Iteration 16, loss = 4.97567042
Validation score: 0.041060
Iteration 17, loss = 4.97104261
Validation score: 0.044371
Iterat

Iteration 140, loss = 0.43686811
Validation score: 0.840397
Iteration 141, loss = 0.43108265
Validation score: 0.834437
Iteration 142, loss = 0.42424408
Validation score: 0.840397
Iteration 143, loss = 0.41928789
Validation score: 0.847020
Iteration 144, loss = 0.41282678
Validation score: 0.845033
Iteration 145, loss = 0.40711436
Validation score: 0.839073
Iteration 146, loss = 0.40136788
Validation score: 0.845695
Iteration 147, loss = 0.39582094
Validation score: 0.841722
Iteration 148, loss = 0.39059670
Validation score: 0.845033
Iteration 149, loss = 0.38542384
Validation score: 0.844371
Iteration 150, loss = 0.38016470
Validation score: 0.847682
Iteration 151, loss = 0.37549487
Validation score: 0.849669
Iteration 152, loss = 0.37020021
Validation score: 0.846358
Iteration 153, loss = 0.36613267
Validation score: 0.851656
Iteration 154, loss = 0.36021942
Validation score: 0.847020
Iteration 155, loss = 0.35665901
Validation score: 0.844371
Iteration 156, loss = 0.35203401
Validat



MLPClassifier(batch_size=32, early_stopping=True, hidden_layer_sizes=(256, 151),
              learning_rate='adaptive', random_state=97, solver='sgd',
              verbose=True)

In [64]:
classifier.score(val_X, val_y)

0.8041935483870968

##### AdaBoost Classifier

In [186]:
from sklearn.ensemble import GradientBoostingClassifier

In [189]:
# initializer Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(learning_rate = 0.01, n_iter_no_change = 10, verbose = 1)

In [190]:
# train gb_classifier
gb_classifier.fit(train_X, train_y)

      Iter       Train Loss   Remaining Time 
         1           3.9994           12.71m
         2           3.7270           13.15m
         3           3.5334           13.10m
         4           3.3805           12.90m
         5           3.2546           12.78m
         6           3.1468           12.69m
         7           3.0531           12.53m
         8           2.9653           12.36m
         9           2.8891           12.20m
        10           2.8169           12.04m
        20           2.3170           10.60m
        30           2.0055            9.24m
        40           1.7720            7.91m
        50           1.5912            6.60m
        60           1.4391            5.27m
        70           1.3104            3.95m
        80           1.1980            2.63m
        90           1.1021            1.31m
       100           1.0179            0.00s


GradientBoostingClassifier(learning_rate=0.01, n_iter_no_change=10, verbose=1)

In [191]:
gb_classifier.score(val_X, val_y)

0.7316129032258064

In [192]:
gb_classifier.score(test_X, test_y)

0.6163636363636363

### Save model and intent list

In [25]:
# save model
filename = '../intent_classifier.sav'
pickle.dump(svm, open(filename, 'wb'))

In [26]:
# append new_line character
labels = [label+ '\n' for label in label_set]
labels[-1] = labels[-1].strip('\n')

# save intent list
with open('../intent_list.txt', 'w') as file:
    file.writelines(labels)
    
# save vocabs
with open('../vocabs.pickle', 'wb') as file:
    vocabs = tfidf_vectorizer.vocabulary_
    pickle.dump(vocabs, file)
    
# save vectorizer
with open('../tfidf_vectorizer.pickle', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)