In [1]:
import torch
import torchsso

import warnings
warnings.filterwarnings("ignore")

vecLib, which is a part of Accelerate, is known not to work correctly with Chainer.
We recommend using other BLAS libraries such as OpenBLAS.
For details of the issue, please see
https://docs.chainer.org/en/stable/tips.html#mnist-example-does-not-converge-in-cpu-mode-on-mac-os-x.

Please be aware that Mac OS X is not an officially supported OS.

  ''')  # NOQA


In [2]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler
print(torch.__version__)

## Embeddings
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from gensim.models import KeyedVectors

## NLP libs
from nltk import download
import gensim

# English stop words; transformText removes these from a phrase when called with do_stop=True.
stopwords = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"}

1.3.0


In [3]:
import math
import torch
from matplotlib import pyplot as plt
import numpy as np

%matplotlib inline

In [4]:
!ls ../../../personal/intents_uncertainty/

[1m[36m2017-06-custom-intent-engines[m[m
README.md
Untitled.ipynb
intent_classifier_uncertainty_mc_dropout.ipynb
model__uncertainty.pth
snips_dataset.csv


In [5]:
# Relative path to the "2017-06-custom-intent-engines" directory shown in the directory listing above.
# NOTE(review): no other cell visible in this notebook reads this variable — confirm it is still needed.
path_dataset = '../../../personal/intents_uncertainty/2017-06-custom-intent-engines/'

In [6]:
# Load the intent phrases and rename the "usersays" column to "phrase".
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
dataset = pd.read_pickle('intents_phrases_183.pkl')
dataset = dataset.rename(columns={"usersays":"phrase"})
dataset.tail()

Unnamed: 0,phrase,intent
2671,modify her military information,workerVeteranStatus.update
2672,modify worker military status,workerVeteranStatus.update
2673,change employee veteran status,workerVeteranStatus.update
2674,change his military status,workerVeteranStatus.update
2675,update Brian's veteran status,workerVeteranStatus.update


In [7]:
def transformText(text, do_stop=False, do_stem=False):
    """Normalise a phrase before it is embedded.

    Steps, in order: lower-case the text; delete the substrings "'s", "’s",
    "?" and "-"; replace every non-ASCII character with a space; collapse
    whitespace; optionally drop stop words; apply gensim's
    ``strip_punctuation2``; collapse whitespace again; optionally apply
    gensim's ``stem_text``.

    Args:
        text: Raw phrase (a string).
        do_stop: When equal to True, words contained in the module-level
            ``stopwords`` set are removed.
        do_stem: When equal to True, the result is passed through
            ``gensim.parsing.preprocessing.stem_text``.

    Returns:
        The cleaned phrase as a single string.
    """
    squeeze_spaces = gensim.corpora.textcorpus.strip_multiple_whitespaces

    cleaned = text.lower()
    # Deletions are applied in this fixed order.
    for fragment in ("'s", "’s", "?", "-"):
        cleaned = cleaned.replace(fragment, "")
    cleaned = re.sub(r'[^\x00-\x7f]', r' ', cleaned)
    cleaned = squeeze_spaces(cleaned)

    tokens = cleaned.split()
    if do_stop == True:
        tokens = [token for token in tokens if token not in stopwords]
    cleaned = " ".join(tokens)

    cleaned = gensim.parsing.preprocessing.strip_punctuation2(cleaned)
    cleaned = squeeze_spaces(cleaned)
    if do_stem == True:
        cleaned = gensim.parsing.preprocessing.stem_text(cleaned)
    return cleaned

In [8]:
# Add a cleaned copy of every phrase (stop words removed) as the "preproc_text" column.
dataset['preproc_text'] = dataset['phrase'].apply(transformText, do_stop=True)
dataset.tail(5)

Unnamed: 0,phrase,intent,preproc_text
2671,modify her military information,workerVeteranStatus.update,modify military information
2672,modify worker military status,workerVeteranStatus.update,modify worker military status
2673,change employee veteran status,workerVeteranStatus.update,change employee veteran status
2674,change his military status,workerVeteranStatus.update,change military status
2675,update Brian's veteran status,workerVeteranStatus.update,update brian veteran status


In [9]:
## Build label vocabulary
# Map every distinct intent string to a consecutive integer id, in order of
# first appearance. The whole label is used as the key (it is not split on
# whitespace) because Intents.__getitem__ looks a label up by the full value
# of the `intent` column; splitting would raise KeyError for any label that
# contains whitespace and would count words instead of intents.
label_to_ix = {}
for label in dataset.intent:
    if label not in label_to_ix:
        label_to_ix[label] = len(label_to_ix)
print("# Labels: {}".format(len(label_to_ix)))

# Labels: 183


In [10]:
class Intents(Dataset):
    """Dataset yielding (averaged word2vec sentence vector, intent id) pairs.

    Args:
        dataframe: Frame with a ``preproc_text`` column (cleaned phrase) and
            an ``intent`` column (label string).
        w2v_weights_path: Path to a binary word2vec file, or an already
            loaded ``KeyedVectors`` instance. Passing a loaded instance lets
            several datasets share one copy of the vectors instead of
            reading the file once per dataset.
    """

    def __init__(self, dataframe, w2v_weights_path):
        self.len = len(dataframe)
        # Kept for backward compatibility; lookups use the module-level
        # `label_to_ix` mapping (see __getitem__).
        self.label_to_ix = {}
        self.data = dataframe
        if isinstance(w2v_weights_path, KeyedVectors):
            self.w2v = w2v_weights_path
        else:
            self.w2v = KeyedVectors.load_word2vec_format(w2v_weights_path, binary = True)

    def __getitem__(self, index):
        """Return ``(X, y)`` for the row at *position* ``index``.

        ``X`` is the float32 averaged word vector of the phrase and ``y`` is
        the integer id of the row's intent taken from the module-level
        ``label_to_ix`` mapping.
        """
        # Positional access: a DataLoader hands out 0..len-1, which only
        # coincides with index labels when the frame has a fresh RangeIndex.
        phrase = self.data.preproc_text.iloc[index]
        X, _  = self.get_avg_sentence_vector(phrase)
        y = label_to_ix[self.data.intent.iloc[index]]
        return X, y

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

    def get_avg_sentence_vector(self, sentence):
        """Average the vectors of the in-vocabulary words of ``sentence``.

        Args:
            sentence: Whitespace-separated words.

        Returns:
            A tuple ``(vector, not_found_words)``. ``vector`` is a float32
            array of length ``self.w2v.vector_size``; it is all zeros when no
            word is in the vocabulary. ``not_found_words`` lists the words
            that were skipped, in order of appearance.
        """
        featureVec = np.zeros((self.w2v.vector_size), dtype="float32")
        nwords = 0
        not_found_words = []
        for word in sentence.split():
            # Use the model's own membership test rather than scanning the
            # `index2word` list, which is linear in the vocabulary size.
            if word in self.w2v:
                nwords = nwords+1
                featureVec = np.add(featureVec, self.w2v.get_vector(word))
            else:
                not_found_words.append(word)
        if nwords>0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec, not_found_words

In [11]:
# Location of the binary word2vec file that Intents loads via KeyedVectors.load_word2vec_format.
w2v_weights_path = '../../../vectors/GoogleNews-vectors-negative300.bin'

In [12]:
# Random 80/20 split of `dataset` into disjoint train and test frames.
train_size = 0.8
# Keep the original index labels on the sample until the complement has been
# taken: dropping by a *reset* index (0..n_train-1) would remove the first
# n_train rows of `dataset` instead of the sampled ones, leaving a test set
# that overlaps the training set.
train_sample = dataset.sample(frac=train_size, random_state=200)
test_dataset = dataset.drop(train_sample.index).reset_index(drop=True)
train_dataset = train_sample.reset_index(drop=True)

In [None]:
# Wrap each split in an Intents dataset.
# NOTE: each Intents(...) call here is given the file path, so the word2vec file is loaded from disk twice.
training_set = Intents(train_dataset,  w2v_weights_path)
testing_set = Intents(test_dataset, w2v_weights_path)

In [None]:
# Report the (rows, columns) shape of the full frame and of each split.
for _template, _frame in (("FULL Dataset: {}", dataset),
                          ("TRAIN Dataset: {}", train_dataset),
                          ("TEST Dataset: {}", test_dataset)):
    print(_template.format(_frame.shape))

In [None]:
# Integer label id of the first training sample.
training_set[0][1]

In [None]:
# Shape of the first training sample's sentence vector.
training_set[0][0].shape

In [None]:
# Parameters
params = {'batch_size': 8,
          'shuffle': True,
          'drop_last': True,
          'num_workers': 0}
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [None]:
# Pull one batch from the training loader for inspection.
sample_phrases, sample_labels = next(iter(training_loader))

In [None]:
# Shapes of the sampled feature batch and label batch.
sample_phrases.shape, sample_labels.shape

## MLP Model

In [None]:
class SimpleMLP(nn.Module):
    """One-hidden-layer classifier: Linear -> Dropout -> ReLU -> Linear.

    Args:
        inputdim: Size of each input vector.
        nclasses: Number of output scores per sample.
        nhidden: Width of the hidden layer.
        dropout: Dropout probability applied after the first linear layer.
        cudaEfficient: When true, the layer stack is moved to the GPU with
            ``.cuda()`` at construction time.
    """

    def __init__(self, inputdim, nclasses, nhidden, dropout=0, cudaEfficient=True):
        super().__init__()

        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses

        # The same stack is used on CPU and GPU; only the placement differs.
        layers = [
            nn.Linear(inputdim, nhidden),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(nhidden, nclasses),
        ]
        network = nn.Sequential(*layers)
        self.model = network.cuda() if cudaEfficient else network

    def forward(self, x):
        """Return the output of the layer stack for input ``x``."""
        scores = self.model(x)
        return scores

In [None]:
# Hyper-parameters chosen by hand.
NHIDDEN = 512
DROPOUT = 0.3
# Sizes derived from the data: embedding width and number of distinct labels.
INP_DIM = training_set.w2v.vector_size
NUM_LABELS = len(label_to_ix)

# CPU model (cudaEfficient=False keeps the layers off the GPU).
model = SimpleMLP(
    inputdim=INP_DIM,
    nclasses=NUM_LABELS,
    nhidden=NHIDDEN,
    dropout=DROPOUT,
    cudaEfficient=False,
)

In [None]:
# Number of samples in the dataset behind the training loader.
len(training_loader.dataset)

In [None]:
# Loss, optimiser and training hyper-parameters read by train() as notebook globals.
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)
# Alternative optimiser, currently disabled:
#optimizer = torchsso.optim.VOGN(model, dataset_size=len(training_loader.dataset))
# NOTE(review): train() assigns its own local max_epochs from its `epochs` argument,
# so this module-level value appears unused — confirm before relying on it.
max_epochs = 5

In [None]:
# Fetch one training batch to sanity-check the model's forward pass.
sent, label = next(iter(training_loader))

In [None]:
# Run the batch through the model and compare the output and label shapes.
out = model.forward(sent)
print(out.shape)
print(label.shape)
out

In [None]:
def train(model, epochs):
    """Train ``model`` and report test accuracy after every epoch.

    Uses the notebook-level ``training_loader``, ``testing_loader``,
    ``optimizer`` and ``loss_function``.

    Args:
        model: The ``nn.Module`` to optimise; it must be the module whose
            parameters were handed to ``optimizer``.
        epochs: Number of full passes over ``training_loader``.

    Returns:
        The string ``"Training finished!"``. The model is left in training
        mode, as before.
    """
    for epoch in tqdm_notebook(range(epochs)):
        print("EPOCH -- {}".format(epoch))

        model.train()
        # Sentinels so an empty training loader is detected instead of
        # raising NameError on `i` / `loss` below.
        i, loss = -1, None
        for i, (sent, labels) in enumerate(training_loader):
            optimizer.zero_grad()
            output = model(sent)
            loss = loss_function(output, labels)
            loss.backward()
            optimizer.step()

        if loss is None:
            print("Training loader produced no batches; skipping evaluation.")
            continue

        # Evaluate once per epoch. The previous `if i%10 == 0` check ran after
        # the batch loop, so it only fired when the index of the last batch
        # happened to be a multiple of 10.
        model.eval()  # disable dropout while measuring accuracy
        correct = 0
        total = 0
        with torch.no_grad():  # no autograd bookkeeping needed for evaluation
            for (sent, labels) in testing_loader:
                output = model(sent)
                _, predicted = torch.max(output, 1)
                total += labels.size(0)
                correct += (predicted.cpu() == labels.cpu()).sum().item()
        model.train()

        accuracy = 100.00 * correct / total if total else float("nan")
        print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))
    return "Training finished!"

In [None]:
# Train the model for 50 epochs.
train(model,50)