In [1]:
# Torch, Sklearn imports
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [2]:
## AllenNLP
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from allennlp.modules.token_embedders import BertEmbedder

print("PyTorch: {}".format(torch.__version__))
print("AllenNLP: {}".format(allennlp.__version__))

PyTorch: 1.0.0
AllenNLP: 0.8.0


In [3]:
## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

## Sklearn imports
from sklearn.datasets import fetch_20newsgroups

## General libs
import numpy as np
import pandas as pd
from string import punctuation
import os, re, sys, json, requests, pickle

02/06/2019 18:49:18 - INFO - summarizer.preprocessing.cleaner -   'pattern' package not found; tag filters are not available for English


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rsilvei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
!ls

1.sentence_interpolation.ipynb
2.snips_classifier_no_augmentation.ipynb
3.twentynews_classifier_no_augmentation.ipynb
4-sst1_classifier_no_augmentation.ipynb
README.md
[1m[36mdataset[m[m
dataset_handler.ipynb


In [5]:
sst1_dataset = pd.read_pickle('dataset/SST1_capado.pkl')
sst1_dataset.tail()

Unnamed: 0,sentence,label,split
72115,the problem with concept films is that if the ...,1,test
72116,"safe conduct , however ambitious and well inte...",1,test
72117,"a film made with as little wit , interest , an...",1,test
72118,to enjoy this movie 's sharp dialogue and deli...,2,test
72119,"but here 's the real damn it is n't funny , ei...",0,test


In [6]:
list(set(sst1_dataset.label))

['3', '1', '2', '4', '0']

In [7]:
## Number of sentences per label

In [8]:
for i in list(set(sst1_dataset.label)):
    print("LABEL {} - PHRASES {}".format(i,len(sst1_dataset[sst1_dataset.label==i])))

LABEL 3 - PHRASES 18158
LABEL 1 - PHRASES 17141
LABEL 2 - PHRASES 24550
LABEL 4 - PHRASES 6622
LABEL 0 - PHRASES 5649


In [9]:
train_dataset = sst1_dataset[sst1_dataset.split == 'train'].reset_index(drop=True)
test_dataset = sst1_dataset[sst1_dataset.split == 'test'].reset_index(drop=True)
len(train_dataset), len(test_dataset)

(68916, 2140)

In [10]:
train_dataset['label'] = train_dataset['label'].apply(lambda x: int(x))
test_dataset['label'] = test_dataset['label'].apply(lambda x: int(x))

In [11]:
type(test_dataset['label'][0])

numpy.int64

In [12]:
train_dataset.tail()

Unnamed: 0,sentence,label,split
68911,so much fun dissing the film that they did n't...,1,train
68912,had so much fun dissing the film that they did...,0,train
68913,walked out muttering words like `` horrible ''...,1,train
68914,walked out muttering words like `` horrible ''...,0,train
68915,her fans walked out muttering words like `` ho...,0,train


In [13]:
for i in list(set(train_dataset.label)):
    print("LABEL {} - PHRASES {}".format(i,len(train_dataset[train_dataset.label==i])))

LABEL 0 - PHRASES 5240
LABEL 1 - PHRASES 16248
LABEL 2 - PHRASES 23973
LABEL 3 - PHRASES 17383
LABEL 4 - PHRASES 6072


In [31]:
train_pruned = pd.DataFrame(columns = ['sentence', 'label','split'])
valid_pruned = pd.DataFrame(columns = ['sentence', 'label','split'])

In [32]:
type(train_dataset.label[0])

numpy.int64

In [33]:
pruned_num_phrases = 150

In [34]:
for i in list(set(train_dataset.label)):
    print("LABEL {}".format(i))
    train_ = train_dataset[train_dataset.label==i][0:pruned_num_phrases]
    valid_ = train_dataset[train_dataset.label==i][pruned_num_phrases:]
    print(len(train_), len(valid_))
    train_pruned = pd.concat([train_pruned,train_]).reset_index(drop=True)
    valid_pruned = pd.concat([valid_pruned,valid_]).reset_index(drop=True)

LABEL 0
150 5090
LABEL 1
150 16098
LABEL 2
150 23823
LABEL 3
150 17233
LABEL 4
150 5922


In [35]:
train_pruned.tail()

Unnamed: 0,sentence,label,split
745,if you 're a fan of the series you 'll love it...,4,train
746,"the story ... is inspiring , ironic , and reve...",4,train
747,"dark humor , gorgeous exterior photography ,",4,train
748,a stable full of solid performances,4,train
749,"dark humor , gorgeous exterior photography , a...",4,train


In [36]:
valid_pruned.tail()

Unnamed: 0,sentence,label,split
68161,red dragon '' never cuts corners .,4,train
68162,that makes the formula fresh again,4,train
68163,an enthusiastic charm in fire that makes the f...,4,train
68164,there 's an enthusiastic charm in fire that ma...,4,train
68165,we 've seen the hippie turned yuppie plot befo...,4,train


In [37]:
## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):        
    sent = []
    doc = spacy_en(text)
    for word in doc:
        if word.lemma_ == "-PRON-":
            sent.append(word.text)
        else:
            sent.append(word.lemma_)
    return " ".join(sent)

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

In [38]:
def transformText(text, do_stop=True, do_stem=False, do_lema = False):
    stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    
    # Removing E-mails  
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    text = re.sub(r'\S*@\S*\s?', r' ', text)
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Removing Newline
    text = text.rstrip()
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Stemming
    if (do_stem==True):
        text = gensim.parsing.preprocessing.stem_text(text)
    
    # Lemmatization
    if (do_lema==True):
        text = do_lemmatization(text)   
        
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    return text

In [40]:
## Clean Dataset (only stopword removal, punct, ascii - no lemma, stemm)
train_pruned['clean_text']=train_pruned['sentence'].apply(lambda x: transformText(x))
valid_pruned['clean_text']=valid_pruned['sentence'].apply(lambda x: transformText(x))

In [41]:
train_pruned.sentence[0]

"'s not life affirming its vulgar and mean"

In [42]:
train_pruned.clean_text[0]

' s life affirming vulgar mean'

In [43]:
valid_pruned.sentence[200]

'will be lulled into a coma .'

In [44]:
valid_pruned.clean_text[200]

'lulled coma '

In [45]:
## ELMo
elmo_weights_key_path = '../../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_config_key_path = '../../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

## S3 Configs for SageMaker
#bucket = 'adp-e-ml-notebooks-sagemaker'             
#prefix = 'vectors'   
#elmo_weights_key = '{}/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'.format(prefix)
#elmo_weights_key_path = os.path.join('s3://', bucket, elmo_weights_key)
#elmo_config_key = '{}/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'.format(prefix)
#elmo_config_key_path = os.path.join('s3://', bucket, elmo_config_key)

### Elmo Instance
elmo = Elmo(elmo_config_key_path, 
            elmo_weights_key_path, 
            num_output_representations = 1, 
            dropout=0.3,
            requires_grad = False)
if torch.cuda.is_available():
    elmo = elmo.cuda()

02/06/2019 18:54:35 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [46]:
def get_elmo(sent):
    elmo.eval()
    sent = [sent.split()]
    character_ids = batch_to_ids(sent)
    if torch.cuda.is_available():
        character_ids = character_ids.cuda()
    embeddings = elmo(character_ids)
    rep = embeddings['elmo_representations'][0]
    rep = rep.squeeze(dim=0)
    avg = rep.mean(dim=0)
    return avg

In [47]:
get_elmo("testing this")

tensor([-0.3639,  0.1719,  0.0151,  ...,  0.5625, -0.5524, -0.0254],
       grad_fn=<MeanBackward0>)

In [48]:
len(train_pruned), len(valid_pruned)

(750, 68166)

In [49]:
## Data Loading Class
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        phrase = self.data.clean_text[index]
        X = get_elmo(phrase)
        y = self.data.label[index]
        return X, y
    
    def __len__(self):
        return self.len

In [50]:
train_set = Intents(train_pruned)
valid_set = Intents(valid_pruned)

In [51]:
train_set.__len__(), valid_set.__len__()

(750, 68166)

In [52]:
train_set.__getitem__(1)

(tensor([-0.1988, -0.0597, -0.0195,  ..., -0.0663,  0.0579,  0.2277],
        grad_fn=<MeanBackward0>), 0)

In [53]:
valid_set.__getitem__(2000)

(tensor([-0.2477, -0.2810,  0.0854,  ...,  0.2093, -0.0032,  0.1197],
        grad_fn=<MeanBackward0>), 0)

## Simple MLP Classifier

In [54]:
class SimpleMLP(nn.Module):
    def __init__(self, inputdim, 
                        nclasses, 
                        nhidden, 
                        dropout = 0):
        super(SimpleMLP, self).__init__()
        """
        PARAMETERS:
        -dropout:    dropout for MLP
        """
        
        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, nhidden),
            nn.Dropout(p=self.dropout),
            nn.ReLU(),
            nn.Linear(nhidden, self.nclasses),
            )
        if torch.cuda.is_available():
            self.model = self.model.cuda()
    def forward(self, x):
        log_probs = self.model(x)
        return log_probs

In [55]:
len(list(set(train_set.data.label)))

5

In [56]:
INP_DIM = elmo.get_output_dim()
NUM_LABELS = len(list(set(train_set.data.label)))
NHIDDEN = 64
DROPOUT = 0

In [57]:
model = SimpleMLP(inputdim = INP_DIM ,
              nhidden = NHIDDEN,
              nclasses = NUM_LABELS,
              dropout = DROPOUT)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    model = model.to(device)
    model.cuda()

## Training

In [58]:
# Dataloaders Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
train_loader = DataLoader(train_set, **params)
valid_loader = DataLoader(valid_set, **params)
# Hyperparams
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001 
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [59]:
max_epochs = 5
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(train_loader):
        
        ## Step 1 - Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()
        
        ## Step 2 - Run forward pass
        output = model.forward(sent)
        
        ## Step 3 - Compute loss
        loss = loss_function(output, label)
        loss.backward()
        
        ## Step 4 = Update parameters
        optimizer.step()
        if i%50 == 0:
            
            # Calculate Accuracy         
            correct = 0
            total = 0
            
            for sent, label in valid_loader:      
                if torch.cuda.is_available():
                    sent = sent.cuda()
                    label = label.cuda()
                
                # Forward pass only to get logits/output
                output = model.forward(sent)
                
                # Get predictions from the maximum value
                _, predicted = torch.max(output.data, 1)
                
                # Total number of labels
                total += label.size(0)

                # Total correct predictions
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            # Print Loss
            print('LOSS: {}. VALID ACCURACY: {}%'.format(loss.data, accuracy))

EPOCH -- 0


KeyboardInterrupt: 

In [None]:
def get_reply(phrase):
    x = get_elmo(phrase)
    logits_out = model.forward(x)
    softmax_out = F.softmax(logits_out, dim=0).cpu()
    _, pred_label = torch.max(softmax_out.data, 0)
    prediction=list(label_to_ix.keys())[pred_label]
    return prediction

In [None]:
get_reply("change this music")

In [None]:
get_reply("weather in Porto Alegre")

## Checking test error

In [None]:
correct = 0
total = 0
errors = []
accuracy = 0
for i in range(len(test_dataset)):
    msg = str(test_dataset['clean_text'][i])
    lbl = str(test_dataset['intent'][i])
    pred = get_reply(msg)
    total +=1
    if pred == lbl:
        correct += 1
    else:
        errors.append((msg,lbl))
test_accuracy = 100.00 * correct / total

In [None]:
print("TEST ACCURACY  -- {}".format(test_accuracy))