In [1]:
# Torch, Sklearn imports
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [2]:
## AllenNLP
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from allennlp.modules.token_embedders import BertEmbedder

print("PyTorch: {}".format(torch.__version__))
print("AllenNLP: {}".format(allennlp.__version__))

PyTorch: 1.0.0
AllenNLP: 0.8.0


In [3]:
## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

## General libs
import numpy as np
import pandas as pd
from string import punctuation
import os, re, sys, json, requests, pickle

02/06/2019 09:57:23 - INFO - summarizer.preprocessing.cleaner -   'pattern' package not found; tag filters are not available for English


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rsilvei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#!pip install -U pandas

In [5]:
# Load the pickled training split. The displayed tail shows a table with
# 'phrase' and 'intent' columns.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("dataset/snips_sliced_train.pkl",'rb') as f:
    train_dataset = pickle.load(f)

In [6]:
train_dataset.tail()

Unnamed: 0,phrase,intent
695,Let me listen to an eighties ep.,PlayMusic
696,Play Les Lecter Smith from deezer.,PlayMusic
697,I want to hear Merry Go Round by Gary Nichols ...,PlayMusic
698,Please help me find the Bloom: Remix Album song.,SearchCreativeWork
699,I would like to hear something from Groove Shark,PlayMusic


In [None]:
# Load the pickled test split; later cells read its 'phrase' and 'intent'
# columns.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("dataset/snips_sliced_test.pkl",'rb') as f:
    test_dataset = pickle.load(f)

In [None]:
test_dataset.tail()

In [None]:
## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):
    """Return `text` with every token replaced by its lemma.

    Tokens whose lemma is the placeholder "-PRON-" keep their surface form.
    The resulting tokens are joined with single spaces.

    NOTE(review): relies on a module-level `spacy_en` callable that is not
    defined in any visible cell - it must be created (presumably a loaded
    spaCy English pipeline) before this function is called.
    """
    lemmas = [
        token.text if token.lemma_ == "-PRON-" else token.lemma_
        for token in spacy_en(text)
    ]
    return " ".join(lemmas)

def strip_punctuation(s):
    """Return `s` with every character found in `string.punctuation` removed."""
    kept_chars = filter(lambda ch: ch not in punctuation, s)
    return ''.join(kept_chars)

In [None]:
def transformText(text, do_stop=True, do_stem=False, do_lema = False):
    """Normalise a raw phrase before it is embedded.

    Steps, in order: lower-case, replace non-ASCII characters with a space,
    collapse repeated whitespace, optionally drop English stop words, strip
    punctuation, optionally stem, optionally lemmatise.

    Parameters
    ----------
    text : str
        Raw input phrase.
    do_stop : bool
        Remove NLTK English stop words when true (default).
    do_stem : bool
        Apply gensim's `stem_text` when true.
    do_lema : bool
        Apply `lemmatizer_spacy` when true.

    Returns
    -------
    str
        The cleaned phrase.
    """
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Removing all the stopwords; the stop-word set is only built when needed
    filtered_words = text.split()
    if do_stop:
        stops = set(stopwords.words("english"))
        filtered_words = [word for word in filtered_words if word not in stops]

    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    # Stemming
    if do_stem:
        text = gensim.parsing.preprocessing.stem_text(text)

    # Lemmatization: use the helper that actually exists in this notebook
    if do_lema:
        text = lemmatizer_spacy(text)
    return text

In [None]:
## Clean both splits with transformText's defaults: stop-word, punctuation and
## non-ASCII removal only (stemming and lemmatisation stay switched off).
train_dataset['clean_text'] = train_dataset['phrase'].apply(transformText)
test_dataset['clean_text'] = test_dataset['phrase'].apply(transformText)

In [None]:
test_dataset.tail()

In [None]:
## Build word vocabulary: each new token gets the next free integer id
word_to_ix = {}
for sent in train_dataset.clean_text:
    for word in sent.split():
        # len() is evaluated before insertion, so ids are 0, 1, 2, ...
        word_to_ix.setdefault(word, len(word_to_ix))

print("Dict Size: {}".format(len(word_to_ix)))
## Build label vocabulary the same way, from the intent column
label_to_ix = {}
for label in train_dataset.intent:
    for word in label.split():
        label_to_ix.setdefault(word, len(label_to_ix))
print("Num labels: {}".format(len(label_to_ix)))

In [None]:
## ELMo: locations of the pretrained options / weights files
elmo_weights_key_path = '../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_config_key_path = '../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

### Elmo Instance: a single output representation, pretrained weights frozen
elmo = Elmo(
    options_file=elmo_config_key_path,
    weight_file=elmo_weights_key_path,
    num_output_representations=1,
    requires_grad=False,
    dropout=0.3,
)
# Move the embedder to the GPU when one is present
if torch.cuda.is_available():
    elmo = elmo.cuda()

In [None]:
def get_elmo(sent):
    """Embed a whitespace-tokenised sentence as the mean of its ELMo vectors.

    Parameters
    ----------
    sent : str
        Sentence to embed; it is split on whitespace.

    Returns
    -------
    torch.Tensor
        1-D tensor of length `elmo.get_output_dim()`, detached from autograd.
        A sentence without tokens yields a zero vector, because there is
        nothing to average. The tensor is on the GPU when CUDA is available.
    """
    # Inference only: disable the dropout configured on the embedder
    elmo.eval()

    tokens = sent.split()
    if not tokens:
        # e.g. a phrase made only of stop words becomes "" after cleaning
        empty_rep = torch.zeros(elmo.get_output_dim())
        if torch.cuda.is_available():
            empty_rep = empty_rep.cuda()
        return empty_rep

    character_ids = batch_to_ids([tokens])
    if torch.cuda.is_available():
        character_ids = character_ids.cuda()

    # The embedder is used as a fixed feature extractor, so no autograd graph
    # is needed; without this the output may still track gradients
    # (presumably through ELMo's layer-mixing weights - TODO confirm).
    with torch.no_grad():
        embeddings = elmo(character_ids)

    # Drop the batch dimension of size 1, then average over the tokens
    rep = embeddings['elmo_representations'][0].squeeze(dim=0)
    return rep.mean(dim=0)

In [None]:
get_elmo("testing this")

In [None]:
## Train/Valid split
# First 80 % of the rows train the model, the remaining 20 % validate it.
data_split = int(0.8 * len(train_dataset))
# Both halves get a fresh 0..n-1 index so positional and label lookups agree.
train = train_dataset.iloc[:data_split].reset_index(drop=True)
# Slice to the end: a stop of -1 would silently discard the final row.
valid = train_dataset.iloc[data_split:].reset_index(drop=True)

In [None]:
len(train), len(valid)

In [None]:
## Data Loading Class
class Intents(Dataset):
    """Dataset yielding (sentence embedding, label index) pairs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a 'clean_text' column (cleaned phrases) and an 'intent'
        column (label names present in the module-level `label_to_ix`).
    """

    def __init__(self, dataframe):
        self.len = len(dataframe)
        # Kept for backward compatibility; label lookups use the module-level
        # `label_to_ix` mapping, not this attribute.
        self.label_to_ix = {}
        self.data = dataframe

    def __getitem__(self, index):
        """Return `(X, y)` for the row at position `index`.

        `X` is the `get_elmo` embedding of the cleaned phrase and `y` the
        integer id of the row's intent.
        """
        # Samplers hand out positions 0..len-1, so index by position; a
        # label-based lookup fails when the frame keeps a non-default index.
        phrase = self.data.clean_text.iloc[index]
        X = get_elmo(phrase)
        y = label_to_ix[self.data.intent.iloc[index]]
        return X, y

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return self.len

In [None]:
# Wrap the two dataframe splits so a DataLoader can batch them.
train_set = Intents(train)
valid_set = Intents(valid)

In [None]:
# Number of samples in each split
len(train_set), len(valid_set)

In [None]:
# Inspect one (embedding, label id) pair from the training split
train_set[1]

In [None]:
# Inspect one (embedding, label id) pair from the validation split
valid_set[1]

## Simple MLP Classifier

In [None]:
class SimpleMLP(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    Architecture: Linear(inputdim, nhidden) -> Dropout -> ReLU ->
    Linear(nhidden, nclasses).

    PARAMETERS:
    -inputdim:   size of each input vector
    -nclasses:   number of output classes
    -nhidden:    width of the hidden layer
    -dropout:    dropout for MLP
    """

    def __init__(self, inputdim, nclasses, nhidden, dropout = 0):
        super(SimpleMLP, self).__init__()

        # Keep the configuration on the instance for later inspection
        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses

        layers = [
            nn.Linear(inputdim, nhidden),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(nhidden, nclasses),
        ]
        network = nn.Sequential(*layers)
        # Place the layers on the GPU when one is available
        self.model = network.cuda() if torch.cuda.is_available() else network

    def forward(self, x):
        """Return the output of the final linear layer for `x`.

        No softmax / log-softmax is applied here; the values are unnormalised
        class scores.
        """
        return self.model(x)

In [None]:
# Classifier configuration
INP_DIM = elmo.get_output_dim()  # input size = length of one ELMo sentence vector
NUM_LABELS = len(label_to_ix)    # one output unit per distinct intent
NHIDDEN = 32                     # hidden-layer width
DROPOUT = 0                      # dropout probability (0 disables dropout)

In [None]:
model = SimpleMLP(inputdim = INP_DIM ,
              nhidden = NHIDDEN,
              nclasses = NUM_LABELS,
              dropout = DROPOUT)

# Use the first GPU when one exists and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Last expression of the cell: display the model architecture
model

## Training

In [None]:
# Dataloaders Parameters (shared by the training and validation loaders)
params = dict(batch_size=32, shuffle=True, num_workers=0)
train_loader = DataLoader(train_set, **params)
valid_loader = DataLoader(valid_set, **params)
# Hyperparams: cross-entropy objective optimised with Adam
learning_rate = 0.001
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
max_epochs = 10
validate_every = 50  # run validation on every 50th training batch (incl. the first)

for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(train_loader):
        # Validation below switches to eval mode, so re-enable training mode
        # (active dropout) at the start of every step.
        model.train()

        ## Step 1 - Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()

        ## Step 2 - Run forward pass
        output = model(sent)

        ## Step 3 - Compute loss
        loss = loss_function(output, label)
        loss.backward()

        ## Step 4 = Update parameters
        optimizer.step()

        if i % validate_every == 0:
            # Calculate accuracy on the validation split
            correct = 0
            total = 0

            # Evaluation mode + no_grad: deterministic outputs, no autograd graph
            model.eval()
            with torch.no_grad():
                # Separate names so the training batch is not overwritten
                for valid_sent, valid_label in valid_loader:
                    if torch.cuda.is_available():
                        valid_sent = valid_sent.cuda()
                        valid_label = valid_label.cuda()

                    # Forward pass only to get logits/output
                    valid_output = model(valid_sent)

                    # Get predictions from the maximum value
                    _, predicted = torch.max(valid_output, 1)

                    # Total number of labels
                    total += valid_label.size(0)

                    # Total correct predictions, accumulated as a plain int
                    correct += (predicted == valid_label).sum().item()

            # Guard against an empty validation loader
            accuracy = 100.00 * correct / total if total else 0.0
            # Print Loss
            print('LOSS: {}. VALID ACCURACY: {}%'.format(loss.item(), accuracy))

In [None]:
def get_reply(phrase):
    """Predict the intent label for `phrase`.

    The phrase is embedded with `get_elmo`, scored by the module-level
    `model`, and the highest-scoring class index is mapped back to its name
    through `label_to_ix`.

    Parameters
    ----------
    phrase : str
        Text to classify.

    Returns
    -------
    The key of `label_to_ix` whose value equals the predicted class index.
    """
    x = get_elmo(phrase)

    # Score in evaluation mode without building an autograd graph, then put
    # the model back into whichever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits_out = model(x)
    finally:
        model.train(was_training)

    # Softmax does not change which entry is largest, so take argmax directly
    pred_ix = int(torch.argmax(logits_out.cpu(), dim=0))

    # Invert the mapping explicitly rather than relying on dict key order
    ix_to_label = {ix: label for label, ix in label_to_ix.items()}
    return ix_to_label[pred_ix]

In [None]:
get_reply("change this music")

In [None]:
get_reply("weather in Porto Alegre")

## Checking test error

In [None]:
# Classify every cleaned test phrase and compare with its gold intent.
correct = 0
total = 0
errors = []  # (cleaned phrase, gold label) for every misclassified example

# Iterate over the column values directly so the loop does not depend on the
# dataframe having a 0..n-1 index.
for msg, lbl in zip(test_dataset['clean_text'], test_dataset['intent']):
    msg = str(msg)
    lbl = str(lbl)
    pred = get_reply(msg)
    total += 1
    if pred == lbl:
        correct += 1
    else:
        errors.append((msg, lbl))

# Percentage of correct predictions; 0.0 when the test set is empty
test_accuracy = 100.00 * correct / total if total else 0.0

In [None]:
print("TEST ACCURACY  -- {}".format(test_accuracy))