In [91]:
#!pip install pandas==0.24.1

In [92]:
import pandas as pd
import numpy as np
import json, re

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler
print(torch.__version__)

## Embeddings
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from gensim.models import KeyedVectors

## NLP libs
from nltk import download
import gensim

stopwords = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"}

1.3.0


In [93]:
import math
import torch
import gpytorch
from matplotlib import pyplot as plt
import numpy as np

%matplotlib inline

In [94]:
!ls ../../../personal/intents_uncertainty/

[1m[36m2017-06-custom-intent-engines[m[m
README.md
Untitled.ipynb
intent_classifier_uncertainty_mc_dropout.ipynb
model__uncertainty.pth
snips_dataset.csv


In [95]:
path_dataset = '../../../personal/intents_uncertainty/2017-06-custom-intent-engines/'

In [96]:
# dataset = pd.DataFrame(columns = ['phrase', 'intent'])
# for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
#                'SearchScreeningEvent']:
#     with open(path_dataset + intent + "/train_" + intent + ".json",
#               encoding='cp1251') as data_file:
#         data = json.load(data_file)
#     print("Intent: {}, Length: {}".format(intent,len(data[intent])))
#     texts = []
#     for i in range(len(data[intent])):
#         text = ''
#         for j in range(len(data[intent][i]['data'])):
#             text += data[intent][i]['data'][j]['text']
#         dataset = dataset.append({'phrase': text, 'intent': intent}, ignore_index=True)

In [97]:
dataset = pd.read_pickle('intents_phrases_183.pkl')
dataset = dataset.rename(columns={"usersays":"phrase"})
dataset.tail()

Unnamed: 0,phrase,intent
2671,modify her military information,workerVeteranStatus.update
2672,modify worker military status,workerVeteranStatus.update
2673,change employee veteran status,workerVeteranStatus.update
2674,change his military status,workerVeteranStatus.update
2675,update Brian's veteran status,workerVeteranStatus.update


In [98]:
def transformText(text, do_stop=False, do_stem=False):
    # Convert text to lower
    text = text.lower()
    
    # Cleaning input
    text = text.replace("'s","")
    text = text.replace("’s","")
    text = text.replace("?","")
    text = text.replace("-","")
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stopwords]
    else:
        filtered_words = [word for word in text.split()]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [99]:
dataset['preproc_text'] = dataset['phrase'].apply(lambda x: transformText(x, do_stop=True))
dataset.tail(5)

Unnamed: 0,phrase,intent,preproc_text
2671,modify her military information,workerVeteranStatus.update,modify military information
2672,modify worker military status,workerVeteranStatus.update,modify worker military status
2673,change employee veteran status,workerVeteranStatus.update,change employee veteran status
2674,change his military status,workerVeteranStatus.update,change military status
2675,update Brian's veteran status,workerVeteranStatus.update,update brian veteran status


In [100]:
## Build word vocabulary
word_to_ix = {}
for sent in dataset.preproc_text:
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print("Tamanho do dicionario: {}".format(len(word_to_ix)))

Tamanho do dicionario: 792


In [101]:
## Build label vocabulary
label_to_ix = {}
for label in dataset.intent:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
print("# Labels: {}".format(len(label_to_ix)))

# Labels: 183


In [102]:
class Intents(Dataset):
    def __init__(self, dataframe, w2v_weights_path):
        self.len = len(dataframe)
        self.label_to_ix = {}
        self.data = dataframe
        self.w2v = KeyedVectors.load_word2vec_format(w2v_weights_path, binary = True)
        
    def __getitem__(self, index):
        phrase = self.data.preproc_text[index]
        X, _  = self.get_avg_sentence_vector(phrase)
        y = label_to_ix[self.data.intent[index]]
        return X, y
    
    def __len__(self):
        return self.len

    def get_avg_sentence_vector(self, sentence):
        featureVec = np.zeros((self.w2v.vector_size), dtype="float32")
        nwords = 0
        not_found_words = []
        for word in sentence.split():
            if word in self.w2v.index2word:
                nwords = nwords+1
                featureVec = np.add(featureVec, self.w2v.get_vector(word))
            else:
                not_found_words.append(word)
        if nwords>0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec, not_found_words

In [103]:
w2v_weights_path = '../../../vectors/GoogleNews-vectors-negative300.bin'

In [104]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [105]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (2676, 3)
TRAIN Dataset: (2141, 3)
TEST Dataset: (535, 3)


In [107]:
training_set.__getitem__(0)[1]

49

In [108]:
training_set.__getitem__(0)[1]

49

In [109]:
training_set = Intents(train_dataset,  w2v_weights_path)
testing_set = Intents(test_dataset, w2v_weights_path)

In [151]:
# Parameters
params = {'batch_size': 16,
          'shuffle': True,
          'drop_last': True,
          'num_workers': 0}

In [152]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [153]:
training_loader

<torch.utils.data.dataloader.DataLoader at 0x4861b4390>

In [154]:
sample_phrases, sample_labels = next(iter(training_loader))

In [155]:
sample_phrases.shape

torch.Size([16, 300])

In [156]:
# train_x = torch.zeros((0,300))
# train_y = torch.zeros(0, dtype=torch.int64)
# for i, (x, y)  in enumerate(training_loader):
#     print("--- ", i)
#     train_x = torch.cat((train_x,x),0)
#     train_y = torch.cat((train_y,y),0)

In [157]:
train_x.shape

torch.Size([2112, 300])

## Standard Classification

In [158]:
class SimpleMLP(nn.Module):
    def __init__(self, inputdim, 
                        nclasses, 
                        nhidden, 
                        dropout = 0,
                        cudaEfficient=True):
        super(SimpleMLP, self).__init__()
        
        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses
        
        if cudaEfficient:
            self.model = nn.Sequential(
                nn.Linear(self.inputdim, nhidden),
                nn.Dropout(p=self.dropout),
                nn.ReLU(),
                nn.Linear(nhidden, self.nclasses),
                ).cuda()
        else:
            self.model = nn.Sequential(
                nn.Linear(self.inputdim, nhidden),
                nn.Dropout(p=self.dropout),
                nn.ReLU(),
                nn.Linear(nhidden, self.nclasses),
                )
    def forward(self, x):
        log_probs = self.model(x)
        return log_probs

In [159]:
INP_DIM = training_set.w2v.vector_size
NUM_LABELS = len(label_to_ix)
NHIDDEN = 512
DROPOUT = 0.3
model = SimpleMLP(inputdim = INP_DIM ,
          nhidden = NHIDDEN,
          nclasses = NUM_LABELS,
          dropout = DROPOUT, 
          cudaEfficient = False)

In [160]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)
max_epochs = 5

In [161]:
sent, label = next(iter(training_loader))

In [162]:
sent.shape, label

(torch.Size([16, 300]),
 tensor([ 26,  54, 112, 156, 122, 172,  68, 149,  85, 155,  47, 178,  19, 116,
         162,   0]))

In [163]:
out = model.forward(sent)
print(out.shape)
print(label.shape)
out

torch.Size([16, 183])
torch.Size([16])


tensor([[ 0.0365,  0.0076, -0.0119,  ..., -0.0663, -0.0285, -0.0482],
        [ 0.0118,  0.0093, -0.0086,  ..., -0.0373, -0.0271, -0.0134],
        [-0.0038,  0.0147, -0.0383,  ..., -0.0472,  0.0040, -0.0483],
        ...,
        [-0.0170,  0.0533,  0.0013,  ..., -0.0389, -0.0172, -0.0678],
        [ 0.0336,  0.0031, -0.0378,  ..., -0.0263,  0.0331, -0.0677],
        [ 0.0202,  0.0667, -0.0040,  ..., -0.0411,  0.0082, -0.0644]],
       grad_fn=<AddmmBackward>)

In [164]:
loss_function(out, label)

tensor(5.2124, grad_fn=<NllLossBackward>)

In [40]:
def train(model, epochs):
    max_epochs = epochs
    model = model.train()
    for epoch in tqdm_notebook(range(max_epochs)):
        print("EPOCH -- {}".format(epoch))
        for i, (sent, labels) in enumerate(training_loader):
            optimizer.zero_grad()
            output = model.forward(sent)
            loss = loss_function(output, labels)
            loss.backward()
            optimizer.step()
        if i%500 == 0:
            correct = 0
            total = 0
            
            output = model.forward(ids.cuda(), token_type_ids=tokens.cuda(), head_mask=None)[0]
                  _, predicted = torch.max(output.data, 1)
                  total += labels.size(0)
                  correct += (predicted.cpu() == labels.cpu()).sum()
              accuracy = 100.00 * correct.numpy() / total
              print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))
    return "Training finished!"

EPOCH -- 0
Iteration: 0. Loss: 5.208669185638428. Accuracy: 0.1953125%
EPOCH -- 1
Iteration: 0. Loss: 4.407190322875977. Accuracy: 10.9375%
EPOCH -- 2
Iteration: 0. Loss: 3.0471835136413574. Accuracy: 33.3984375%
EPOCH -- 3
Iteration: 0. Loss: 2.327571392059326. Accuracy: 52.34375%
EPOCH -- 4
Iteration: 0. Loss: 1.5701735019683838. Accuracy: 63.4765625%


In [None]:
from gpytorch.distributions import base_distributions
from gpytorch.likelihoods.likelihood import Likelihood
from gpytorch.utils.deprecation import _deprecate_kwarg_with_transform

In [None]:
class SoftmaxLikelihood2(Likelihood):
    """
    Implements the Softmax (multiclass) likelihood used for GP classification.
    """

    def __init__(
        self, num_features=None, num_classes=None, mixing_weights=True, mixing_weights_prior=None, **kwargs
    ):
        num_classes = _deprecate_kwarg_with_transform(
            kwargs, "n_classes", "num_classes", num_classes, lambda n: n
        )
        super().__init__()
        if num_classes is None:
            raise ValueError("num_classes is required")
        self.num_classes = num_classes
        if mixing_weights:
            self.num_features = num_features
            if num_features is None:
                raise ValueError("num_features is required with mixing weights")
            self.register_parameter(
                name="mixing_weights",
                parameter=torch.nn.Parameter(torch.randn(num_classes, num_features).div_(num_features)),
            )
            if mixing_weights_prior is not None:
                self.register_prior("mixing_weights_prior", mixing_weights_prior, "mixing_weights")
        else:
            self.num_features = num_classes
            self.mixing_weights = None
        print(self.num_features)

    def forward(self, function_samples, *params, **kwargs):
        num_features, num_data = function_samples.shape[-2:]
        if num_features != self.num_features:
            raise RuntimeError("There should be %d features" % self.num_features)

        if self.mixing_weights is not None:
            mixed_fs = self.mixing_weights @ function_samples  # num_classes x num_data
        else:
            mixed_fs = function_samples
        mixed_fs = mixed_fs.transpose(-1, -2)  # num_data x num_classes
        res = base_distributions.Categorical(logits=mixed_fs)
        return res

## Gaussian Process Classification

In [None]:
train_x.size(0)

In [None]:
train_x.shape

In [None]:
from gpytorch.models import AbstractVariationalGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import VariationalStrategy
from gpytorch.mlls.variational_elbo import VariationalELBO

class GPClassificationModel(AbstractVariationalGP):
    def __init__(self, train_x):
        variational_distribution = CholeskyVariationalDistribution(train_x.size(0))
        variational_strategy = VariationalStrategy(self, train_x, variational_distribution)
        super(GPClassificationModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
        return latent_pred

In [None]:
# Initialize model and likelihood
model = GPClassificationModel(train_x)
#likelihood = gpytorch.likelihoods.BernoulliLikelihood()
likelihood = SoftmaxLikelihood2(num_features=300,
                                mixing_weights=True,
                                num_classes=len(label_to_ix))

In [None]:
## Training Parameters
learning_rate = 0.1
max_epochs = 10
model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
mll = VariationalELBO(likelihood, model, train_y.numel())

In [None]:
train_x.shape, train_y.shape

In [None]:
output = model(train_x)
output.shape

In [None]:
train_y.shape

In [None]:
output

In [None]:
train_y.shape

In [None]:
-mll(output, train_y)

In [None]:
for epoch in range(max_epochs):
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        output = model.forward(sent)        
        loss = -mll(output, label)
        loss.backward(retain_graph=True)
        optimizer.step()
        print('Iter %d/%d - Loss: %.3f' % (i, epoch, loss.item()))
        
#         if i%100 == 0:
#             correct = 0
#             total = 0
#             for sent, label in testing_loader:
#                 sent = Variable(sent)
#                 label = Variable(label)
#                 output = model.forward(sent)
#                 _, predicted = torch.max(output.mean, 1)
#                 total += label.size(0)
#                 correct += (predicted.cpu() == label.cpu()).sum()
#             accuracy = 100.00 * correct.numpy() / total
#             print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

## Make predictions

In [None]:
# Go into eval mode


with torch.no_grad():
    # Test x are regularly spaced by 0.01 0,1 inclusive
    # Get classification predictions
    observed_pred = likelihood(model(test_x))

In [None]:
input_phrase = "i need to book a restaurant today"
model.eval()
likelihood.eval()
inp, _ = training_set.get_avg_sentence_vector(input_phrase)
inp = torch.tensor(inp)
observed_pred = likelihood(model(inp))

In [None]:
def get_reply(phrase):
    model.eval()
    likelihood.eval()
    inp, _ = training_set.get_avg_sentence_vector(phrase)
    inp = torch.Tensor(inp)
    output = model.forward(inp)
    observed_pred = likelihood(model(test_x))

#     # Get predictions from the maximum value
#     _, predicted = torch.max(output.data, 0)
#     pred_label=list(label_to_ix.keys())[list(label_to_ix.values()).index(predicted.item())]
#     return pred_label

In [None]:
get_reply(input_phrase)


In [None]:
## Sources
https://gpytorch.readthedocs.io/en/latest/examples/08_Deep_Kernel_Learning/Deep_Kernel_Learning_DenseNet_CIFAR_Tutorial.html