In [1]:
#!pip install torchcontrib

In [2]:
# Torch, Sklearn imports
import torch
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchcontrib.optim import SWA

## Embeddings
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from gensim.models import KeyedVectors

In [3]:
import pandas as pd
import numpy as np
import json, re

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
print(torch.__version__)

1.2.0


In [4]:
## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error Tunnel connection
[nltk_data]     failed: 502 Parent proxy unreacheable>


False

## Load Dataset

In [5]:
# Load the training utterances of every intent into one DataFrame with the
# columns 'phrase' (full utterance text) and 'intent' (label).
#
# Rows are collected in a plain list and the frame is built once at the end:
# growing a DataFrame with `append` inside a loop copies the whole frame on
# every iteration, and `DataFrame.append` no longer exists in pandas >= 2.0.
records = []
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    # NOTE(review): the cp1251 encoding is kept from the original code; confirm it
    # matches the actual encoding of the JSON files.
    with open("./2017-06-custom-intent-engines/" + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Intent: {}, Length: {}".format(intent,len(data[intent])))
    for utterance in data[intent]:
        # Each utterance is stored as a list of chunks under 'data'; joining the
        # chunks' 'text' fields yields the complete phrase.
        phrase = ''.join(chunk['text'] for chunk in utterance['data'])
        records.append({'phrase': phrase, 'intent': intent})
dataset = pd.DataFrame(records, columns=['phrase', 'intent'])

Intent: AddToPlaylist, Length: 300
Intent: BookRestaurant, Length: 300
Intent: GetWeather, Length: 300
Intent: PlayMusic, Length: 300
Intent: RateBook, Length: 300
Intent: SearchCreativeWork, Length: 300
Intent: SearchScreeningEvent, Length: 300


In [6]:
def transformText(text, do_stop=False, do_stem=False):
    """Normalise a raw phrase before it is turned into word vectors.

    Steps, in order: lowercase; delete "'s", "’s", "?" and "-"; replace every
    non-ASCII character with a space; collapse repeated whitespace; optionally
    drop English stopwords; strip punctuation with gensim's
    ``strip_punctuation2``; collapse whitespace again; optionally stem.

    Parameters
    ----------
    text : str
        Phrase to clean.
    do_stop : bool, default False
        Remove NLTK English stopwords when true.
    do_stem : bool, default False
        Apply gensim's ``stem_text`` to the cleaned phrase when true.

    Returns
    -------
    str
        The cleaned phrase.
    """
    # Convert text to lower
    text = text.lower()

    # Cleaning input
    for fragment in ("'s", "’s", "?", "-"):
        text = text.replace(fragment, "")

    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    words = text.split()
    if do_stop:
        # The stopword corpus is loaded only when it is actually needed: the
        # default path then neither pays for the load on every call nor fails
        # when the NLTK corpus is not installed.
        stops = set(stopwords.words("english"))
        words = [word for word in words if word not in stops]
    text = " ".join(words)

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    if do_stem:
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

def strip_punctuation(s):
    """Return ``s`` with every ASCII punctuation character removed.

    Characters are deleted, not replaced by spaces. The set of characters is
    ``string.punctuation``, imported locally because no cell of the notebook
    imports it.
    """
    from string import punctuation
    return ''.join(c for c in s if c not in punctuation)

## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):
    """Return ``text`` with each token replaced by its lemma, joined by spaces.

    Tokens whose lemma is the literal "-PRON-" keep their surface text
    (presumably spaCy's pronoun placeholder lemma; confirm for the spaCy
    version in use).

    NOTE(review): ``spacy_en`` is not defined in the visible notebook. It must
    be a callable that yields tokens exposing ``.text`` and ``.lemma_``
    (presumably a loaded spaCy English pipeline) and has to be created before
    this function is called.
    """
    return " ".join(
        token.text if token.lemma_ == "-PRON-" else token.lemma_
        for token in spacy_en(text)
    )

In [7]:
# Clean every phrase with transformText's default settings (no stopword removal, no stemming)
dataset['preproc_text'] = dataset['phrase'].apply(transformText)

In [8]:
## Build label vocabulary
# Map each distinct intent label to a contiguous integer id, numbered in order
# of first appearance. The complete label is the key (it is not split on
# whitespace) because the dict is later indexed with the full `intent` value;
# splitting would make that lookup fail for any multi-word label.
label_to_ix = {}
for label in dataset.intent:
    if label not in label_to_ix:
        label_to_ix[label] = len(label_to_ix)

In [9]:
class Intents(Dataset):
    """Torch dataset yielding (averaged word2vec vector, label id) pairs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``preproc_text`` and ``intent``.
    w2v_weights_path : str
        Path to a binary word2vec file, loaded with
        ``KeyedVectors.load_word2vec_format``.

    Label ids are read from the module-level ``label_to_ix`` dict, which must
    exist before items are requested; ``self.label_to_ix`` is kept only as an
    (empty) attribute for compatibility and is not consulted.
    """

    def __init__(self, dataframe, w2v_weights_path):
        self.len = len(dataframe)
        self.label_to_ix = {}
        self.data = dataframe
        self.w2v = KeyedVectors.load_word2vec_format(w2v_weights_path, binary = True)

    def __getitem__(self, index):
        """Return ``(X, y)`` for the row at *position* ``index``."""
        # Positional access: samplers hand out 0..len-1, which only coincides
        # with the frame's index labels if the frame was reset beforehand.
        phrase = self.data['preproc_text'].iloc[index]
        X, _ = self.get_avg_sentence_vector(phrase)
        y = label_to_ix[self.data['intent'].iloc[index]]
        return X, y

    def __len__(self):
        return self.len

    def get_avg_sentence_vector(self, sentence):
        """Average the embeddings of the whitespace-separated words of ``sentence``.

        Returns
        -------
        (numpy.ndarray, list)
            A float32 vector of length ``self.w2v.vector_size`` (all zeros when
            no word is in the vocabulary) and the list of words that were not
            found, in order of occurrence.
        """
        featureVec = np.zeros((self.w2v.vector_size), dtype="float32")
        nwords = 0
        not_found_words = []
        for word in sentence.split():
            # Membership test on the KeyedVectors object itself instead of a
            # linear scan through the `index2word` list for every token.
            if word in self.w2v:
                nwords += 1
                featureVec = np.add(featureVec, self.w2v.get_vector(word))
            else:
                not_found_words.append(word)
        if nwords > 0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec, not_found_words

In [10]:
# Locations of the pre-trained embedding files; all of them sit in one shared directory
_VECTORS_DIR = '../../../vectors/'
_ELMO_STEM = _VECTORS_DIR + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B'
elmo_config_key_path = _ELMO_STEM + '_options.json'
elmo_weights_key_path = _ELMO_STEM + '_weights.hdf5'
w2v_weights_path = _VECTORS_DIR + 'GoogleNews-vectors-negative300.bin'

In [11]:
!ls ../../../vectors/

GoogleNews-vectors-negative300.bin
crawl-300d-2M.vec
[31melmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json[m[m
[31melmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5[m[m
glove.42B.300d.txt
glove.840B.300d.txt
lid.176.ftz
[1m[36mprogram[m[m
[1m[36muncased_L-12_H-768_A-12[m[m
[1m[36muncased_L-24_H-1024_A-16[m[m
wiki-news-300d-1M-subword.txt
wiki-news-300d-1M.txt


In [12]:
train_size = 0.8
# Draw the training rows first and keep their ORIGINAL index labels, so exactly
# those rows can be dropped to form the test set. Resetting the index before the
# drop would remove rows 0..len(train)-1 instead of the sampled ones, leaving a
# test set that overlaps the training data and, because `dataset` is ordered by
# intent, contains only the last intents.
train_rows = dataset.sample(frac=train_size, random_state=200)
test_dataset = dataset.drop(train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)
# The two splits must partition the full dataset.
assert len(train_dataset) + len(test_dataset) == len(dataset)
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (2100, 3)
TRAIN Dataset: (1680, 3)
TEST Dataset: (420, 3)


In [13]:
# Wrap both splits as torch Datasets (train first, then test); each instance
# loads the word2vec file located at w2v_weights_path.
training_set, testing_set = (
    Intents(frame, w2v_weights_path) for frame in (train_dataset, test_dataset)
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Simple MLP Classifier

In [14]:
class SimpleMLP(nn.Module):
    """One-hidden-layer feed-forward classifier: Linear -> Dropout -> ReLU -> Linear.

    ``forward`` returns the raw output of the last linear layer; no softmax or
    log-softmax is applied inside the module.
    """

    def __init__(self, inputdim,
                        nclasses,
                        nhidden,
                        dropout = 0,
                        cudaEfficient=True):
        """
        PARAMETERS:
        -inputdim:      size of each input feature vector
        -nclasses:      number of output classes
        -nhidden:       width of the hidden layer
        -dropout:       dropout probability applied after the first linear layer
        -cudaEfficient: when true, move the layer stack to the GPU with .cuda()
        """
        super(SimpleMLP, self).__init__()

        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses

        # Build the stack once; only the device placement depends on the flag.
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, nhidden),
            nn.Dropout(p=self.dropout),
            nn.ReLU(),
            nn.Linear(nhidden, self.nclasses),
            )
        if cudaEfficient:
            self.model = self.model.cuda()

    def forward(self, x):
        """Return the un-normalised class scores (logits) for ``x``."""
        logits = self.model(x)
        return logits

In [15]:
# Seed torch's global RNG before the model is created so that the weight
# initialisation (and later draws from the same generator) are repeatable
# on Restart & Run All.
SEED = 200  # same value the train/test split passes as random_state
torch.manual_seed(SEED)

# Model hyper-parameters
INP_DIM = training_set.w2v.vector_size   # one input feature per embedding dimension
NUM_LABELS = len(label_to_ix)            # one output unit per intent
NHIDDEN = 512
DROPOUT = 0.3
model = SimpleMLP(inputdim = INP_DIM ,
          nhidden = NHIDDEN,
          nclasses = NUM_LABELS,
          dropout = DROPOUT,
          cudaEfficient = False)

In [None]:
# DataLoader settings shared by the train and test loaders
params = dict(batch_size=64, shuffle=True, num_workers=1)
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

# Loss and optimiser
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train with Adam; every 100th batch of an epoch (starting with batch 0) the
# current model is scored on the whole test loader.
# NOTE(review): an epoch with fewer than 101 batches is only evaluated at
# batch 0, i.e. right after its first optimisation step.
max_epochs = 5
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    model.train()  # make sure dropout is active for the optimisation steps
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        output = model(sent)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        if i%100 == 0:
            correct = 0
            total = 0
            # Score in eval mode (dropout off) and without recording gradients.
            # Distinct loop names keep the training batch from being shadowed.
            model.eval()
            with torch.no_grad():
                for test_sent, test_label in testing_loader:
                    test_output = model(test_sent)
                    _, predicted = torch.max(test_output, 1)
                    total += test_label.size(0)
                    correct += (predicted.cpu() == test_label.cpu()).sum().item()
            model.train()  # back to training mode for the next batch
            accuracy = 100.00 * correct / total
            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

In [None]:
def get_reply(phrase):
    """Predict the intent label of a free-text ``phrase``.

    The phrase goes through the same ``transformText`` cleaning as the training
    data, is embedded with ``training_set.get_avg_sentence_vector`` and scored
    by the module-level ``model``. The label with the highest score is returned;
    there is no "unknown" outcome.

    The model is switched to eval mode for the prediction, so dropout cannot
    randomly change the answer, and is put back into its previous mode afterwards.
    """
    inp, _ = training_set.get_avg_sentence_vector(transformText(phrase))
    features = torch.as_tensor(inp, dtype=torch.float32)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores = model(features)
    finally:
        model.train(was_training)

    predicted_ix = int(torch.argmax(scores, dim=0))
    # Invert label -> id so the predicted id can be mapped back to its label.
    ix_to_label = {ix: label for label, ix in label_to_ix.items()}
    return ix_to_label[predicted_ix]

In [None]:
# Manual check: a restaurant-booking style request
get_reply("i need to book a restaurant today")

In [None]:
# Manual check: a very short music request
get_reply("play music")

In [None]:
# Manual check with a phrase that matches none of the intents: get_reply always
# returns the highest-scoring known label, so some intent is reported anyway.
get_reply("Obama is cool")

## Using Stochastic Weight Averaging (SWA) optimizer

In [None]:
# DataLoader settings shared by the train and test loaders
params = dict(batch_size=64, shuffle=True, num_workers=1)
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

# Loss and base optimiser
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001
base_opt = optim.Adam(model.parameters(), lr=learning_rate)

# Wrap Adam in torchcontrib's SWA optimiser.
# NOTE(review): swa_lr (0.05) is 50x the Adam learning rate above; confirm this is intended.
opt = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

In [None]:
# Continue training with the SWA-wrapped optimiser; every 100th batch of an
# epoch (starting with batch 0) the current weights are scored on the test loader.
max_epochs = 5
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    model.train()  # make sure dropout is active for the optimisation steps
    for i, (sent, label) in enumerate(training_loader):
        opt.zero_grad()
        output = model(sent)
        loss = loss_function(output, label)
        loss.backward()
        opt.step()
        if i%100 == 0:
            correct = 0
            total = 0
            # Score in eval mode (dropout off) and without recording gradients.
            # Distinct loop names keep the training batch from being shadowed.
            model.eval()
            with torch.no_grad():
                for test_sent, test_label in testing_loader:
                    test_output = model(test_sent)
                    _, predicted = torch.max(test_output, 1)
                    total += test_label.size(0)
                    correct += (predicted.cpu() == test_label.cpu()).sum().item()
            model.train()  # back to training mode for the next batch
            accuracy = 100.00 * correct / total
            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))
# After training, exchange the model's current weights with the SWA running averages.
opt.swap_swa_sgd()