## Steps for the LSTM Classifier:
1. Build Vector loader in PyTorch (word2vec, GloVe, Fasttext)
2. Convert sentence (concatenate, other methods?) into embeddings
3. Build simple LSTM classifier and make it train
4. Evaluate results (accuracy, log_loss)

In [1]:
import numpy as np
import pandas as pd

## Torch imports
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Sklearn imports
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss

# NLP libs imports
from nltk import download
from nltk.corpus import stopwords
import re
import gensim

from pathlib import Path
import pickle
download('stopwords')

Using TensorFlow backend.
  return f(*args, **kwds)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rsilvei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
torch.manual_seed(11)

<torch._C.Generator at 0x104109c90>

In [3]:
train = pd.read_csv('../datasets/train_apiai_8.csv', delimiter=";")
print("Train size: {}".format(len(train)))
train.tail()

Train size: 721


Unnamed: 0,usersays,intent
716,Martha smokes,workerTobaccoUsageStatus.update
717,Bill start using tobacco,workerTobaccoUsageStatus.update
718,Margarite start smoking,workerTobaccoUsageStatus.update
719,Employee quit smoking,workerTobaccoUsageStatus.update
720,Chase quit smoking,workerTobaccoUsageStatus.update


In [4]:
## Filtering out useless classes
train = train[train['intent']!='init.greetings']
train = train[train['intent']!='Default Welcome Intent']
print("Train size: {}".format(len(train)))

Train size: 710


In [5]:
train[train['intent']=='personPersonalPhoneNumber.update']

Unnamed: 0,usersays,intent
567,change my mobile phone number,personPersonalPhoneNumber.update
568,update mobile number,personPersonalPhoneNumber.update
569,modify number,personPersonalPhoneNumber.update
570,adjust phone,personPersonalPhoneNumber.update
571,change phone,personPersonalPhoneNumber.update
572,update number,personPersonalPhoneNumber.update
573,update phone,personPersonalPhoneNumber.update
574,update my phone,personPersonalPhoneNumber.update
575,Update my phone,personPersonalPhoneNumber.update
576,Change personal phone,personPersonalPhoneNumber.update


In [6]:
print(len(np.unique(train['intent'])))

72


In [7]:
def transformText(text, do_stop=False, do_stem=False):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]

    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [8]:
train['usersays_preprocessed']=train['usersays'].apply(lambda x: transformText(x,do_stop=False,do_stem=True))
train.loc[538]

usersays                 change LGBT status
intent                    personLGBT.update
usersays_preprocessed      chang lgbt statu
Name: 538, dtype: object

In [9]:
x_test_1 = pd.read_csv('../datasets/test_intents_1.csv', delimiter=";")
x_test_1 = x_test_1.drop(['EventName', 'Result','Feedback','silveira_v2','error','Intent accuracy', '0.9615384615'], axis=1)
x_test_1=x_test_1.rename(columns={"Intents": "usersays"},)

In [10]:
x_test_1['intent']="personPersonalPhoneNumber.update"
x_test_1.tail()

Unnamed: 0,usersays,intent
67,Update Phone numbers,personPersonalPhoneNumber.update
68,Modify cell,personPersonalPhoneNumber.update
69,modify cellphone,personPersonalPhoneNumber.update
70,modify cell phone number,personPersonalPhoneNumber.update
71,Update,personPersonalPhoneNumber.update


In [11]:
x_test = pd.read_csv('../datasets/test_apiai_6_test.csv', delimiter=";")
x_test.tail()

Unnamed: 0,usersays,intent
185,bulk job requisitions,volumeJobRequisition.add
186,5 job requisitions,volumeJobRequisition.add
187,adjust date birth for Bruno,workerBirthInformation.update
188,correct employee birthdate,workerBirthInformation.update
189,correct Roberto DOB,workerBirthInformation.update


In [12]:
x_test_1.tail()

Unnamed: 0,usersays,intent
67,Update Phone numbers,personPersonalPhoneNumber.update
68,Modify cell,personPersonalPhoneNumber.update
69,modify cellphone,personPersonalPhoneNumber.update
70,modify cell phone number,personPersonalPhoneNumber.update
71,Update,personPersonalPhoneNumber.update


In [13]:
x_test_1

Unnamed: 0,usersays,intent
0,Update phone number,personPersonalPhoneNumber.update
1,Update phone,personPersonalPhoneNumber.update
2,Update phone personal,personPersonalPhoneNumber.update
3,Update mobile,personPersonalPhoneNumber.update
4,Update mobile number,personPersonalPhoneNumber.update
5,Update Phones,personPersonalPhoneNumber.update
6,Update my personal number,personPersonalPhoneNumber.update
7,Update cell phone number,personPersonalPhoneNumber.update
8,Update cell,personPersonalPhoneNumber.update
9,Update my cell phone number,personPersonalPhoneNumber.update


In [14]:
test = x_test.append(x_test_1,ignore_index=True).reset_index()
test.drop(labels=['index'], axis=1, inplace=True)
test

Unnamed: 0,usersays,intent
0,i need to add a new assessment please,assessment.add
1,start new assessment,assessment.add
2,please add a new assessment here,assessment.add
3,make a new assessment,assessment.add
4,send this assessment,assessment.submit
5,finish pre-screening and submit,assessment.submit
6,send assessment,assessment.submit
7,take and assessment,assessment.take
8,i need to answer one assessment,assessment.take
9,Fill out assessment,assessment.take


## Dropping intents that are not in training set
- personalBirthInformation.update  
- scheduleEvaluation.create  
- scheduleEvaluation.update  
- jobRequisition.publish  

In [15]:
len(test)

262

In [16]:
test[test['intent']=='personalBirthInformation.update']

Unnamed: 0,usersays,intent
153,i need to edit DOB,personalBirthInformation.update
154,edit my personal birth date,personalBirthInformation.update
155,adjust date born,personalBirthInformation.update


In [17]:
test = test[test['intent']!='personalBirthInformation.update']
test = test[test['intent']!='scheduleEvaluation.create']
test = test[test['intent']!='scheduleEvaluation.update']
test = test[test['intent']!='jobRequisition.publish']
print("Train size: {}".format(len(test)))

Train size: 249


In [18]:
test['usersays_preprocessed']=test['usersays'].apply(lambda x: transformText(x,do_stop=False,do_stem=True))
test.tail()

Unnamed: 0,usersays,intent,usersays_preprocessed
257,Update Phone numbers,personPersonalPhoneNumber.update,updat phone number
258,Modify cell,personPersonalPhoneNumber.update,modifi cell
259,modify cellphone,personPersonalPhoneNumber.update,modifi cellphon
260,modify cell phone number,personPersonalPhoneNumber.update,modifi cell phone number
261,Update,personPersonalPhoneNumber.update,updat


In [19]:
print(len(np.unique(test['usersays'])))

248


## Persisting train and test datasets

In [250]:
train.to_pickle("train.pkl")
test.to_pickle("test.pkl")

## Train/test split

In [20]:
x_train, x_valid, y_train, y_valid = train_test_split(train['usersays_preprocessed'], train['intent'], test_size = 0, random_state = 4)
print("#" * 20 + " Some stats " + "#"*20)
print("Dataset training: {} uterances".format(x_train.shape[0]))
print("Dataset testing: {} uterances".format(x_valid.shape[0]))
print("Different classes: {}".format(len(set(y_train))))
x_test=np.array(test['usersays_preprocessed'])

#################### Some stats ####################
Dataset training: 710 uterances
Dataset testing: 0 uterances
Different classes: 72


In [21]:
features = CountVectorizer(max_df=0.9, stop_words=None,)
x_train_sgd = features.fit_transform(x_train)
x_valid_sgd = features.transform(x_valid)
x_test_sgd = features.transform(x_test)
y_test = np.array(test['intent'])

In [22]:
list1=list(sorted(set(y_train)))

In [23]:
list2=list(sorted(set(y_test)))

In [24]:
len(list1)

72

In [25]:
len(list2)

57

In [26]:
list3=list(set(list1+list2))
len(list3)

72

In [27]:
for i in list3:
    if i not in list1:
        print(i)

In [28]:
model = SGDClassifier(penalty='l2', loss='log', class_weight= None)
model.fit(x_train_sgd, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [29]:
pred_sgd = model.predict(x_test_sgd)
pred_proba_sgd = model.predict_proba(x_test_sgd)
current_acc = accuracy_score(y_test, pred_sgd)
#logloss = log_loss(y_test, pred_proba_sgd, labels=y_test)
print("Current Accuracy: {0:.3f}".format(current_acc))
#print("Log - loss: {0:.3f}".format(logloss))

Current Accuracy: 0.787


## Testing other classifiers with CNN and Word2vec embeddings

In [30]:
list(x_train[0:5])

['cancel job evalu now pleas',
 'del job board',
 'chang job list',
 'suspend requisit',
 'updat job applic']

In [31]:
list(x_test[0:5])

['need add new assess pleas',
 'start new assess',
 'pleas add new assess here',
 'make new assess',
 'send thi assess']

## Build vocabulary for words

In [32]:
word_2_idx = {}
for sent in list(x_train) + list(x_test):
    for word in sent.split():
        if word not in word_2_idx:
            word_2_idx[word]=len(word_2_idx)

In [33]:
VOCAB_SIZE = len(word_2_idx)
VOCAB_SIZE

346

In [34]:
word_2_idx['lgbt']

162

In [35]:
sorted_vocab = sorted(word_2_idx.keys())
sorted_vocab[-10:-1]

['want',
 'washington',
 'weather',
 'what',
 'where',
 'which',
 'withdraw',
 'worker',
 'yesterdai']

## Build vocabulary for labels

In [98]:
label_2_idx = {}
for label in list(y_train)+list(y_test):
    for word in label.split():
        if word not in label_2_idx:
            label_2_idx[word]=len(label_2_idx)

In [37]:
NUM_LABELS = len(label_2_idx)
NUM_LABELS

72

In [99]:
label_2_idx

{'assessment.add': 62,
 'assessment.submit': 55,
 'assessment.take': 59,
 'assessment.update': 67,
 'autogenerate.match.report': 64,
 'candidate.refer': 40,
 'create.rejection.template': 68,
 'evaluation.cancel': 0,
 'evaluation.review': 60,
 'evaluation.schedule': 11,
 'evaluation.update': 13,
 'externalJobApplication.submit': 20,
 'helper.intents': 9,
 'internalJobApplication.submit': 32,
 'job.create': 51,
 'jobApplication.cancel': 69,
 'jobApplication.evaluate': 45,
 'jobApplication.reject': 42,
 'jobApplication.update': 4,
 'jobApplication.withdraw': 21,
 'jobApplicationInterest.confirm': 24,
 'jobBoard.create': 18,
 'jobBoard.delete': 1,
 'jobBoard.update': 43,
 'jobOffer.create': 54,
 'jobOffer.evaluate': 29,
 'jobOffer.revoke': 50,
 'jobOfferNegotiation.evaluate': 58,
 'jobPosting.cancel': 22,
 'jobPosting.create': 33,
 'jobPosting.update': 2,
 'jobReferral.activate': 53,
 'jobReferral.cancel': 47,
 'jobReferral.evaluate': 63,
 'jobReferral.hold': 31,
 'jobReferral.payment': 36

In [39]:
label_2_idx['job.create']

51

In [40]:
sorted_labels = sorted(label_2_idx.keys())
sorted_labels[-10:-1]

['send.auto_reject_email_chat',
 'view.job',
 'volumeJobRequisition.add',
 'worker.usI9Screening.section1.complete',
 'workerBirthInformation.update',
 'workerLGBT.update',
 'workerMaritalStatus.update',
 'workerPersonalEmail.update',
 'workerPersonalPhoneNumber.update']

## Building iterable datasets

In [74]:
batch_size = 5
n_iters = 1000000
num_epochs = n_iters/(len(x_train))/batch_size
num_epochs=int(num_epochs)
num_epochs

281

In [64]:
train_data=list(zip(x_train,y_train))
train_data[0:5]

[('cancel job evalu now pleas', 'evaluation.cancel'),
 ('del job board', 'jobBoard.delete'),
 ('chang job list', 'jobPosting.update'),
 ('suspend requisit', 'jobRequisition.hold'),
 ('updat job applic', 'jobApplication.update')]

In [65]:
test_data=list(zip(x_test,y_test))
test_data[-5:-1]

[('updat phone number', 'personPersonalPhoneNumber.update'),
 ('modifi cell', 'personPersonalPhoneNumber.update'),
 ('modifi cellphon', 'personPersonalPhoneNumber.update'),
 ('modifi cell phone number', 'personPersonalPhoneNumber.update')]

## Model 1 - BoW Classifier

In [66]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        
        ## Defining parameters for linear model
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        ## do the foward pass and implement non-linearity
        return F.log_softmax(self.linear(bow_vec))

In [133]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence.split():
        if word in word_to_ix:
            vec[word_to_ix[word]] += 1
    return vec.view(1,-1)

In [68]:
def make_target(label, label_to_idx):
    return torch.LongTensor([label_to_idx[label]])

In [69]:
n=1
sample_phrase=make_bow_vector(x_train[n],word_2_idx)
print(">> SENTENCE: {}".format(x_train[n]))
print(">> SENTIMENT: {}".format(y_train[n]))
print(">> INPUT SIZE: {}".format(sample_phrase.size()))
print(">> INPUT FORMAT: {}".format(type(sample_phrase)))

>> SENTENCE: establish assess
>> SENTIMENT: assessment.add
>> INPUT SIZE: torch.Size([1, 346])
>> INPUT FORMAT: <class 'torch.FloatTensor'>


In [70]:
model_bow = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

## Training

In [71]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = optim.SGD(params=model_bow.parameters(), lr=learning_rate)

In [72]:
## simple forward to see if its working
out=model_bow(Variable(sample_phrase))
out

Variable containing:

Columns 0 to 9 
-4.2881 -4.3137 -4.3099 -4.3115 -4.2084 -4.2875 -4.3024 -4.2906 -4.3755 -4.2277

Columns 10 to 19 
-4.1846 -4.2087 -4.2776 -4.3100 -4.2904 -4.3533 -4.3110 -4.2199 -4.3169 -4.2235

Columns 20 to 29 
-4.2669 -4.3148 -4.3924 -4.2596 -4.3112 -4.2589 -4.2432 -4.3128 -4.3194 -4.3262

Columns 30 to 39 
-4.1679 -4.2687 -4.1892 -4.3090 -4.1652 -4.2343 -4.2558 -4.1782 -4.3275 -4.3351

Columns 40 to 49 
-4.3577 -4.1921 -4.3014 -4.2671 -4.2578 -4.3748 -4.2154 -4.2927 -4.3486 -4.2638

Columns 50 to 59 
-4.2311 -4.2913 -4.2759 -4.2958 -4.2563 -4.3747 -4.2279 -4.2698 -4.1729 -4.2724

Columns 60 to 69 
-4.3137 -4.2881 -4.3187 -4.1976 -4.3656 -4.3461 -4.3197 -4.3183 -4.1867 -4.3851

Columns 70 to 71 
-4.1812 -4.2334
[torch.FloatTensor of size 1x72]

In [75]:
iter = 0
for epoch in range(num_epochs):
    for (sent,label) in train_data:
        # Step 1 - clear the gradients
        model_bow.zero_grad()
        optimizer.zero_grad()
        
        ## Step 2- Prepare input and label
        bow_vec = Variable(make_bow_vector(sent, word_2_idx))
        target = Variable(make_target(label, label_2_idx))
        
        # Step 3 - Run forward pass
        output = model_bow(bow_vec)
        #print("Log probabilities - {}".format(log_probs))
        
        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
        iter+=1      
        ## Calculate final accuracy
        if iter % 1000 ==0:
            correct = 0
            total = 0
            for (sent,label) in test_data:
                bow_vec = Variable(make_bow_vector(sent, word_2_idx))
                target = Variable(make_target(label, label_2_idx))
                output = model_bow(bow_vec)
                _,predicted = torch.max(output.data,1)
                total += target.size(0)
                correct += (predicted == make_target(label, label_2_idx)).sum()
            accuracy = 100 * correct/total
            print('Iterations: {}. Loss: {}. Accuracy: {}'.format(iter,loss.data[0],accuracy))

Iterations: 1000. Loss: 0.1340835690498352. Accuracy: 81.52610441767068
Iterations: 2000. Loss: 0.025858299806714058. Accuracy: 81.52610441767068
Iterations: 3000. Loss: 0.038729552179574966. Accuracy: 81.52610441767068
Iterations: 4000. Loss: 0.09527319669723511. Accuracy: 81.52610441767068
Iterations: 5000. Loss: 0.34346723556518555. Accuracy: 81.12449799196787
Iterations: 6000. Loss: 0.020387429744005203. Accuracy: 81.52610441767068
Iterations: 7000. Loss: 0.15182150900363922. Accuracy: 81.92771084337349
Iterations: 8000. Loss: 0.07401514798402786. Accuracy: 81.52610441767068
Iterations: 9000. Loss: 0.727484405040741. Accuracy: 81.52610441767068
Iterations: 10000. Loss: 0.04679514840245247. Accuracy: 81.12449799196787
Iterations: 11000. Loss: 0.049376796931028366. Accuracy: 81.52610441767068
Iterations: 12000. Loss: 0.020930076017975807. Accuracy: 81.92771084337349
Iterations: 13000. Loss: 0.11241818219423294. Accuracy: 81.52610441767068
Iterations: 14000. Loss: 0.45110806822776794.

Iterations: 112000. Loss: 0.5877761840820312. Accuracy: 81.52610441767068
Iterations: 113000. Loss: 0.3367224335670471. Accuracy: 81.52610441767068
Iterations: 114000. Loss: 0.031178008764982224. Accuracy: 81.12449799196787
Iterations: 115000. Loss: 0.3775192201137543. Accuracy: 81.52610441767068
Iterations: 116000. Loss: 0.0542440302670002. Accuracy: 81.12449799196787
Iterations: 117000. Loss: 0.017885901033878326. Accuracy: 81.52610441767068
Iterations: 118000. Loss: 0.5838537812232971. Accuracy: 81.52610441767068
Iterations: 119000. Loss: 0.12161056697368622. Accuracy: 81.12449799196787
Iterations: 120000. Loss: 0.0031082206405699253. Accuracy: 81.52610441767068
Iterations: 121000. Loss: 0.14370451867580414. Accuracy: 81.12449799196787
Iterations: 122000. Loss: 0.4986082911491394. Accuracy: 81.52610441767068
Iterations: 123000. Loss: 0.06998324394226074. Accuracy: 81.52610441767068
Iterations: 124000. Loss: 0.059031520038843155. Accuracy: 81.12449799196787
Iterations: 125000. Loss: 

## Testing predictions

In [118]:
log_loss_model = loss.data[0]
accuracy

81.92771084337349

In [127]:
input_phrase = "update my cell phone"
input_phrase_preprocessed = transformText(input_phrase,do_stop=False,do_stem=True)
bow_vec = Variable(make_bow_vector(input_phrase_preprocessed, word_2_idx))
print("-"*20 + " INPUT "+"-"*20)
print("SENTENCE           = {}".format(input_phrase))
print("SENTENCE PROCESSED = {}".format(input_phrase_preprocessed))
print("-"*20 + " PREDICTION "+"-"*20)
log_probs = model_bow(bow_vec)
_,predicted = torch.max(log_probs.data,1)
pred_label=list(label_2_idx.keys())[list(label_2_idx.values()).index(predicted[0])]
print("PRED       = {}".format(predicted[0]))
print("PRED LABEL = {}".format(pred_label))
#print("LOG_PROB = {}".format(log_probs))
#print("PROBS = {}".format(F.softmax(log_probs)))

-------------------- INPUT --------------------
SENTENCE           = update my cell phone
SENTENCE PROCESSED = updat cell phone
-------------------- PREDICTION --------------------
PRED       = 56
PRED LABEL = personPersonalPhoneNumber.update


In [245]:
def get_reply_proba(msg):
    input_phrase_preprocessed = transformText(msg,do_stop=False,do_stem=True)
    bow_vec = Variable(make_bow_vector(input_phrase_preprocessed, word_2_idx))
    log_probs = model_bow(bow_vec)
    _,predicted = torch.max(log_probs.data,1)
    pred_label=list(label_2_idx.keys())[list(label_2_idx.values()).index(predicted[0])]
    
    ## Getting probabilities
    probs = F.softmax(log_probs.data)
    probs = probs.data[0].numpy()
    probs_frame = pd.DataFrame({'probs':probs,'label':list(label_2_idx)})
    probs_frame=probs_frame.sort_values(['probs'], ascending=False)[0:3].reset_index(drop=True)
    return pred_label,probs_frame

In [246]:
get_reply_proba("change employee phone")[1]

Unnamed: 0,label,probs
0,workerPersonalPhoneNumber.update,0.9373
1,personPersonalPhoneNumber.update,0.044293
2,workerPersonalEmail.update,0.004063


In [247]:
get_reply_proba("start job board")[1]

Unnamed: 0,label,probs
0,jobBoard.create,0.838538
1,jobBoard.update,0.070859
2,jobBoard.delete,0.067941


In [248]:
get_reply_proba("remove job board")[1]

Unnamed: 0,label,probs
0,jobBoard.delete,0.888908
1,jobBoard.create,0.051807
2,jobBoard.update,0.045804
