In [1]:
#!pip install --upgrade pip
#!pip install -U numpy
#!pip install gensim 
#!pip install spacy==2.0.10
#!pip install greenlet
#!pip install allennlp --user
#!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu90/torch_nightly.html

### Cuda Nightly
"cuda_8": "https://download.pytorch.org/whl/nightly/cu80/torch_nightly.html",  
"cuda_9": "https://download.pytorch.org/whl/nightly/cu90/torch_nightly.html",  
"cuda_10": "https://download.pytorch.org/whl/nightly/cu100/torch_nightly.html",  
"no_cuda": "https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html",  

In [2]:
# Torch, Sklearn imports
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
## AllenNLP
import allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from allennlp.modules.token_embedders import BertEmbedder

print("PyTorch: {}".format(torch.__version__))
print("AllenNLP: {}".format(allennlp.__version__))
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print("Using CUDA device: {}".format(device))
    print("- # GPU device: {}".format(torch.cuda.device_count()))
    print("- Device Name: {}".format(torch.cuda.get_device_name(device)))
    print("- Device Proprierties: {}".format(torch.cuda.get_device_properties(device)))

PyTorch: 1.0.0.dev20190206
AllenNLP: 0.8.1
Using CUDA device: 0
- # GPU device: 1
- Device Name: GeForce GTX 1080 Ti
- Device Proprierties: _CudaDeviceProperties(name='GeForce GTX 1080 Ti', major=6, minor=1, total_memory=11175MB, multi_processor_count=28)


In [4]:
## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

## Sklearn imports
from sklearn.datasets import fetch_20newsgroups

## General libs
import numpy as np
import pandas as pd
from string import punctuation
import os, re, sys, json, requests, pickle

02/09/2019 19:16:56 - INFO - gensim.summarization.textcleaner -   'pattern' package not found; tag filters are not available for English
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/roberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def transformText(text, do_stop=False, do_stem=False, do_lema = False):
    stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    
    # Removing E-mails  
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    text = re.sub(r'\S*@\S*\s?', r' ', text)
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Removing Newline
    text = text.rstrip()
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Stemming
    if (do_stem==True):
        text = gensim.parsing.preprocessing.stem_text(text)
    
    # Lemmatization
    if (do_lema==True):
        text = do_lemmatization(text)   
        
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    return text

In [6]:
## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):        
    sent = []
    doc = spacy_en(text)
    for word in doc:
        if word.lemma_ == "-PRON-":
            sent.append(word.text)
        else:
            sent.append(word.lemma_)
    return " ".join(sent)

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

### Reading SST1 Datasate

In [12]:
sst1_dataset = pd.read_pickle('dataset/SST1_capado.pkl')
sst1_dataset.tail()

Unnamed: 0,sentence,label,split
72115,the problem with concept films is that if the ...,1,test
72116,"safe conduct , however ambitious and well inte...",1,test
72117,"a film made with as little wit , interest , an...",1,test
72118,to enjoy this movie 's sharp dialogue and deli...,2,test
72119,"but here 's the real damn it is n't funny , ei...",0,test


In [13]:
## Number of sentences per label
for i in list(set(sst1_dataset.label)):
    print("LABEL {} - PHRASES {}".format(i,len(sst1_dataset[sst1_dataset.label==i])))

LABEL 0 - PHRASES 5649
LABEL 4 - PHRASES 6622
LABEL 2 - PHRASES 24550
LABEL 1 - PHRASES 17141
LABEL 3 - PHRASES 18158


In [14]:
## ELMo
elmo_weights_key_path = '../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_config_key_path = '../vectors/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

## S3 Configs for SageMaker
#bucket = 'adp-e-ml-notebooks-sagemaker'             
#prefix = 'vectors'   
#elmo_weights_key = '{}/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'.format(prefix)
#elmo_weights_key_path = os.path.join('s3://', bucket, elmo_weights_key)
#elmo_config_key = '{}/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'.format(prefix)
#elmo_config_key_path = os.path.join('s3://', bucket, elmo_config_key)

### Elmo Instance
elmo = Elmo(elmo_config_key_path, 
            elmo_weights_key_path, 
            num_output_representations = 1, 
            dropout=0.3,
            requires_grad = False)
if torch.cuda.is_available():
    elmo = elmo.cuda()

02/09/2019 19:17:42 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [15]:
def get_elmo(sent):
    elmo.eval()
    sent = [sent.split()]
    character_ids = batch_to_ids(sent)
    if torch.cuda.is_available():
        character_ids = character_ids.cuda()
    embeddings = elmo(character_ids)
    rep = embeddings['elmo_representations'][0]
    rep = rep.squeeze(dim=0)
    avg = rep.mean(dim=0)
    avg = avg/torch.norm(avg)
    return avg

In [16]:
get_elmo("what is love baby don't hurt me").mean()

tensor(0.0009, device='cuda:0', grad_fn=<MeanBackward0>)

In [17]:
get_elmo("what is").mean()

tensor(-0.0002, device='cuda:0', grad_fn=<MeanBackward0>)

In [18]:
sst1_dataset['clean_text']=sst1_dataset['sentence'].apply(lambda x: transformText(x))
sst1_dataset['label'] = sst1_dataset['label'].apply(lambda x: int(x))

In [19]:
sst1_dataset.tail()

Unnamed: 0,sentence,label,split,clean_text
72115,the problem with concept films is that if the ...,1,test,the problem with concept films is that if the ...
72116,"safe conduct , however ambitious and well inte...",1,test,safe conduct however ambitious and well intent...
72117,"a film made with as little wit , interest , an...",1,test,a film made with as little wit interest and pr...
72118,to enjoy this movie 's sharp dialogue and deli...,2,test,to enjoy this movie s sharp dialogue and delig...
72119,"but here 's the real damn it is n't funny , ei...",0,test,but here s the real damn it is n t funny either


In [20]:
train_dataset = sst1_dataset[sst1_dataset.split == 'train'].reset_index(drop=True)
test_dataset = sst1_dataset[sst1_dataset.split == 'test'].reset_index(drop=True)

In [21]:
len(train_dataset), len(test_dataset)

(68916, 2140)

In [70]:
## Pre-compute ELMo dataset
train_elmos = np.zeros((len(train_dataset), elmo.get_output_dim()))
for i in range(len(train_dataset)):
    elmo_phrase = get_elmo(train_dataset.loc[i]['clean_text']).detach().cpu().numpy()
    train_elmos[i]= elmo_phrase

In [71]:
train_elmos.shape

(68916, 1024)

In [83]:
train_labels = np.array(train_dataset.label)
train_labels.shape

(68916,)

In [85]:
with open("dataset/train_elmos","wb") as f:
    pickle.dump((train_elmos,train_labels),f, protocol = pickle.HIGHEST_PROTOCOL)

In [88]:
#with open("dataset/train_elmos","rb") as f:
#    tr, lb = pickle.load(f)

In [92]:
#tr.shape, lb.shape

In [77]:
test_elmos = np.zeros((len(test_dataset), elmo.get_output_dim()))
for i in range(len(test_dataset)):
    elmo_phrase = get_elmo(test_dataset.loc[i]['clean_text']).detach().cpu().numpy()
    test_elmos[i]= elmo_phrase

In [78]:
test_elmos.shape

(2140, 1024)

In [94]:
test_labels = np.array(test_dataset.label)
test_labels.shape

(2140,)

In [95]:
with open("dataset/test_elmos","wb") as f:
    pickle.dump((test_elmos,test_labels),f, protocol = pickle.HIGHEST_PROTOCOL)

In [99]:
#with open("dataset/test_elmos","rb") as f:
#    tr, lb = pickle.load(f)

In [100]:
#tr.shape, lb.shape

In [102]:
!ls -lah dataset/

total 1.2G
drwxrwxr-x 3 roberto roberto 4.0K Feb  9 01:33 .
drwxrwxr-x 5 roberto roberto 4.0K Feb  9 01:33 ..
drwxrwxr-x 9 roberto roberto 4.0K Feb  5 21:24 2017-06-custom-intent-engines
-rw-rw-r-- 1 roberto roberto 5.4K Feb  5 21:24 intents_test_roberto.pkl
-rw-rw-r-- 1 roberto roberto 133K Feb  5 21:24 intents_train_roberto.csv
-rw-rw-r-- 1 roberto roberto 564M Feb  8 23:58 pre_computed_dataset
-rw-rw-r-- 1 roberto roberto 238K Feb  5 21:24 snips_dataset.csv
-rw-rw-r-- 1 roberto roberto  86K Feb  5 22:22 snips_sliced_test.pkl
-rw-rw-r-- 1 roberto roberto  43K Feb  5 22:21 snips_sliced_train.pkl
-rw-rw-r-- 1 roberto roberto 6.1M Feb  7 23:30 SST1_capado.csv
-rw-rw-r-- 1 roberto roberto 6.4M Feb  6 19:58 SST1_capado.pkl
-rw-rw-r-- 1 roberto roberto 8.4M Feb  6 19:58 SST1.pkl
-rw-rw-r-- 1 roberto roberto 4.3M Feb  7 23:31 SST2_capado.csv
-rw-rw-r-- 1 roberto roberto 4.5M Feb  6 19:58 SST2_capado.pkl
-rw-rw-r-- 1 roberto roberto 5.1M Feb  6 19:58 SST2.pkl
-rw-rw-r-- 1 rob

In [103]:
!mv dataset/test_elmos dataset/test_elmos.pkl

In [105]:
!mv dataset/train_elmos dataset/train_elmos.pkl

In [107]:
#!ls -lah dataset/

## Train/Test Split

In [23]:
from sklearn.model_selection import train_test_split

In [109]:
x_train, x_valid, y_train, y_valid = train_test_split(train_elmos, train_labels, test_size=0.20, random_state=42)

In [110]:
x_train.shape, y_train.shape

((55132, 1024), (55132,))

In [22]:
## Pruning dataset to have few instances per class for augmentation evaluation
pruned_num_phrases = 1000
train_pruned = pd.DataFrame(columns = ['sentence', 'label','split'])
valid_pruned = pd.DataFrame(columns = ['sentence', 'label','split'])

In [25]:
for i in list(set(train_dataset.label)):
    print("LABEL {} ---------------------".format(i))
    train_ = train_dataset[train_dataset.label==i][0:pruned_num_phrases]
    valid_ = train_dataset[train_dataset.label==i][pruned_num_phrases:]
    print("# train {}, # valid {}".format(len(train_), len(valid_)))
    train_pruned = pd.concat([train_pruned,train_]).reset_index(drop=True)
    valid_pruned = pd.concat([valid_pruned,valid_]).reset_index(drop=True)

LABEL 0 ---------------------
# train 1000, # valid 4240
LABEL 1 ---------------------
# train 1000, # valid 15248
LABEL 2 ---------------------
# train 1000, # valid 22973
LABEL 3 ---------------------
# train 1000, # valid 16383
LABEL 4 ---------------------
# train 1000, # valid 5072


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [26]:
# Clean Dataset (only stopword removal, punct, ascii - no lemma, stemm)
train_pruned['clean_text']=train_pruned['sentence'].apply(lambda x: transformText(x))
valid_pruned['clean_text']=valid_pruned['sentence'].apply(lambda x: transformText(x))
test_dataset['clean_text']=valid_pruned['sentence'].apply(lambda x: transformText(x))

In [114]:
class IntentsPrecomp(Dataset):
    def __init__(self, X, Y):
        self.len = len(X)
        self.data = X
        self.label = Y
        
    def __getitem__(self, index):
        X = torch.tensor(self.data[index], dtype=torch.float32)
        y = torch.tensor(self.label[index], dtype=torch.int64)
        return X, y
    
    def __len__(self):
        return self.len

In [115]:
len(x_train), len(y_train)

(55132, 55132)

In [116]:
training_set = IntentsPrecomp(x_train, y_train)
validing_set = IntentsPrecomp(x_valid, y_valid)

In [117]:
training_set.__getitem__(0), training_set.__getitem__(0)

((tensor([-0.0111,  0.0017,  0.0153,  ...,  0.0377,  0.0142, -0.0148]),
  tensor(3)),
 (tensor([-0.0111,  0.0017,  0.0153,  ...,  0.0377,  0.0142, -0.0148]),
  tensor(3)))

In [118]:
#class Intents(Dataset):
#    def __init__(self, dataframe):
#        self.len = len(dataframe)
#        self.data = dataframe
#        
#    def __getitem__(self, index):
#        phrase = self.data.clean_text[index]
#        X = get_elmo(phrase)
#        y = self.data.label[index]
#        return X, y
#    
#    def __len__(self):
#        return self.len

In [119]:
#training_set = Intents(train_pruned)
#validing_set = Intents(valid_pruned)

In [120]:
#training_set.__len__(), validing_set.__len__()

In [121]:
#training_set.__getitem__(1)

In [122]:
#validing_set.__getitem__(400)

## Simple MLP Classifier

In [123]:
class SimpleMLP(nn.Module):
    def __init__(self, inputdim, 
                        nclasses, 
                        nhidden, 
                        dropout = 0):
        super(SimpleMLP, self).__init__()
        """
        PARAMETERS:
        -dropout:    dropout for MLP
        """
        
        self.inputdim = inputdim
        self.hidden_dim = nhidden
        self.dropout = dropout
        self.nclasses = nclasses
        self.model = nn.Sequential(
            nn.Linear(self.inputdim, nhidden),
            nn.Dropout(p=self.dropout),
            nn.ReLU(),
            nn.Linear(nhidden, self.nclasses),
            )
        if torch.cuda.is_available():
            self.model = self.model.cuda()
    def forward(self, x):
        log_probs = self.model(x)
        return log_probs

In [178]:
INP_DIM = elmo.get_output_dim()
NUM_LABELS = len(set(labels))
NHIDDEN = 2048
DROPOUT = 0

In [179]:
model = SimpleMLP(inputdim = INP_DIM ,
              nhidden = NHIDDEN,
              nclasses = NUM_LABELS,
              dropout = DROPOUT)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    model = model.to(device)
    model.cuda()

In [180]:
samp = training_set.__getitem__(0)[0].cuda()
model.forward(samp)

tensor([ 0.0171, -0.0146,  0.0098,  0.0091,  0.0103], device='cuda:0',
       grad_fn=<AddBackward0>)

## Training

In [181]:
# Dataloaders Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 0}
max_epochs = 30

train_loader = DataLoader(training_set, **params)
valid_loader = DataLoader(validing_set, **params)
# Hyperparams
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.0001 
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [183]:
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(train_loader):
        optimizer.zero_grad() 
        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()
        output = model.forward(sent)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        if i%1000 == 0:      
            correct = 0
            total = 0
            for sent, label in valid_loader:      
                if torch.cuda.is_available():
                    sent = sent.cuda()
                    label = label.cuda()
                output = model.forward(sent)
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('LOSS: {}. VALID ACCURACY: {}%'.format(loss.data, accuracy))

EPOCH -- 0
LOSS: 0.8674242496490479. VALID ACCURACY: 58.13987231572838%
EPOCH -- 1
LOSS: 0.9393641948699951. VALID ACCURACY: 58.35026117237377%
EPOCH -- 2
LOSS: 0.7289527058601379. VALID ACCURACY: 58.06006964596634%
EPOCH -- 3
LOSS: 0.8033251762390137. VALID ACCURACY: 58.248694138131164%
EPOCH -- 4
LOSS: 0.7279648184776306. VALID ACCURACY: 58.32124201973302%
EPOCH -- 5
LOSS: 0.8410013318061829. VALID ACCURACY: 58.45182820661637%
EPOCH -- 6
LOSS: 0.6902561187744141. VALID ACCURACY: 58.30673244341265%
EPOCH -- 7
LOSS: 0.590100109577179. VALID ACCURACY: 58.29947765525247%
EPOCH -- 8
LOSS: 0.6186671257019043. VALID ACCURACY: 58.19065583284968%
EPOCH -- 9
LOSS: 0.6990066766738892. VALID ACCURACY: 58.37928032501451%
EPOCH -- 10
LOSS: 0.7731107473373413. VALID ACCURACY: 58.858096343586766%
EPOCH -- 11
LOSS: 0.6149803400039673. VALID ACCURACY: 59.00319210679048%
EPOCH -- 12
LOSS: 0.5969610214233398. VALID ACCURACY: 58.57515960533952%
EPOCH -- 13
LOSS: 0.6161081790924072. VALID ACCURACY: 58.654

In [184]:
#def get_reply(phrase, model):
#    x = get_elmo(phrase)
#    logits_out = model.forward(x)
#    softmax_out = F.softmax(logits_out, dim=0).cpu()
#    _, pred_label = torch.max(softmax_out.data, 0)
#    return int(pred_label.cpu().numpy())

In [185]:
#get_reply("testing", model)

In [186]:
#get_reply("what is love", model)

## Checking test error

In [187]:
correct = 0
total = 0
for i, utt in enumerate(test_elmos):
    X = torch.tensor(utt, dtype=torch.float32).cuda()
    y = torch.tensor(test_labels[i], dtype=torch.int64)
    logits_out = model(X)
    softmax_out = F.softmax(logits_out, dim=0).cpu()
    _, pred_label = torch.max(softmax_out.data, 0)
    total +=1
    if pred_label == y:
        correct += 1
test_accuracy = 100.00 * correct / total
print("TEST ACCURACY  -- {}".format(test_accuracy))

TEST ACCURACY  -- 44.06542056074766


In [177]:
#correct = 0
#total = 0
#errors = []
#for i in range(len(test_dataset)):
#    msg = str(test_dataset['clean_text'][i])
#    lbl = test_dataset['label'][i]
#    prediction = get_reply(msg, model)
#    #print(prediction)
#    total +=1
#    if prediction == lbl:
#        correct += 1
#    else:
#        errors.append((msg ,lbl))
#test_accuracy = 100.00 * correct / total
#print("TEST ACCURACY  -- {}".format(test_accuracy))

## Implementing Augmentation Class

In [6]:
%matplotlib inline
from torch.nn.functional import interpolate
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [29]:
def get_linear_interpolation(label, num_interp_samples, return_all_points=False):
    sentences= list(train_pruned.clean_text[train_dataset.label == label])
    points = np.zeros((elmo.get_output_dim(),len(sentences)))    
    
    for i,utt in enumerate(sentences):
        points[:,i] = get_elmo(utt).detach().cpu().clone().numpy()
    point = torch.tensor(points)
    x = point.unsqueeze(dim=0) 
    print(x.shape)
    
    ## Random selector for which interpolated phrase to pick
    rand_phrase = np.random.randint(num_interp_samples, size = 1)
    
    ## Interpolate phrases
    interp = interpolate(x, size=(num_interp_samples), mode='linear', align_corners=True).squeeze(0).numpy().T
    
    if return_all_points == False:
        interp =interp[rand_phrase].squeeze(0)      ## Pick Randomly 1 point sample
    return interp

In [30]:
get_linear_interpolation(1 , num_interp_samples = 20).shape

torch.Size([1, 1024, 415])


(1024,)

In [None]:
get_linear_interpolation(2, num_interp_samples = 20, return_all_points=True).shape

In [None]:
def plot_manifold(intents, num_interp_samples, perplexity = 3):
    
    list_intents = {}
    for name in intents:
        phrases = list(train_pruned.clean_text[train_pruned.label == name])
        list_intents.update({name:phrases})
    
    ## List all sentences
    sentences = [item for sublist in list(list_intents.values()) for item in sublist]
    embeddings_np = np.zeros((len(sentences), elmo.get_output_dim()))
    for i,sent in enumerate(sentences):
        embeddings_np[i]= get_elmo(sent).detach().cpu().numpy()
    initial_sent_size = len(sentences)
    
    ## Get syntetic phrases
    for name in list_intents:
        interp = get_linear_interpolation(name, num_interp_samples, return_all_points=True)
        embeddings_np = np.vstack((embeddings_np,interp))
        syntetic_label = "--- interp_{} ---".format(name)
        ## initial sizes
        for i in range(num_interp_samples):
            sentences.append(syntetic_label)

    ## Do TSNE and plot
    tsne=TSNE(n_components=2,perplexity = perplexity, method ='exact',verbose=1)
    sentences_tsne = tsne.fit_transform(embeddings_np)
    plt.subplots(figsize=(30, 15))
    plt.grid()
    
    ## Grouping points by indexes
    points = (sentences_tsne[:initial_sent_size,0], sentences_tsne[:initial_sent_size,1])
    syntetic_points = (sentences_tsne[initial_sent_size:,0], sentences_tsne[initial_sent_size:,1])
    
    ## Ploting points
    plt.scatter(points[0], points[1], c='r', marker='o')
    plt.scatter(syntetic_points[0], syntetic_points[1], c='b', marker='x')

    ## Adding labels
    for label, x, y in zip(sentences[0:initial_sent_size], points[0], points[1]):
        font = {'size' : 10, 'weight' : 'normal'}
        plt.rc('font', **font)
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    for label, x, y in zip(sentences[initial_sent_size:], syntetic_points[0], syntetic_points[1]):
        font = {'size' : 18, 'weight' : 'bold'}
        plt.rc('font', **font)
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')



In [None]:
plot_manifold([1,2,3], 3, perplexity = 3)

In [None]:
class AugmentedIntents(Dataset):
    def __init__(self, dataframe, alpha, num_samples = 20, augmentation = True):
        self.len = len(dataframe)
        self.data = dataframe
        self.augmentation = augmentation
        self.num_samples = num_samples
        self.alpha = alpha                          ## probabilty to get a real phrase vs interpolated phrase
        
    def __getitem__(self, index):
        phrase = self.data.clean_text[index]
        y = self.data.label[index]
        if self.augmentation:
            proba = np.random.binomial(1, self.alpha )
            if proba == 1:
                #print("frase normal")
                X = get_elmo(phrase)
            elif proba == 0:
                #print("frase interpolada")
                X = torch.tensor(get_linear_interpolation(y, num_interp_samples = self.num_samples), dtype=torch.float32)
                if torch.cuda.is_available():
                    X = X.cuda()
        else:
            X = get_elmo(phrase)
        return X, y
    
    def __len__(self):
        return self.len

In [None]:
alpha = 0.5
num_samples = 5
training_set = AugmentedIntents(train_pruned, alpha, num_samples, augmentation = True)
validing_set = AugmentedIntents(valid_pruned,  alpha, num_samples, augmentation = False)

In [None]:
n = training_set.__getitem__(1)

In [None]:
n[0].shape

In [None]:
model_2 = SimpleMLP(inputdim = INP_DIM ,
              nhidden = NHIDDEN,
              nclasses = NUM_LABELS,
              dropout = DROPOUT)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    model_2 = model_2.to(device)
    model_2.cuda()

In [None]:
train_loader = DataLoader(training_set, **params)
valid_loader = DataLoader(validing_set, **params)
# Hyperparams
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001 
optimizer = optim.Adam(params =  model_2.parameters(), lr=learning_rate)

In [None]:
for epoch in range(max_epochs):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(train_loader):        
        optimizer.zero_grad()
        if torch.cuda.is_available():
            sent = sent.cuda()
            label = label.cuda()
        output = model_2.forward(sent)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in valid_loader:
                if torch.cuda.is_available():
                    sent = sent.cuda()
                    label = label.cuda()
                output = model_2.forward(sent)
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('LOSS: {}. VALID ACCURACY: {}%'.format(loss.data, accuracy))

In [None]:
test_dataset

In [None]:
correct = 0
total = 0
errors = []
for i in range(len(test_dataset)):
    msg = str(test_dataset['clean_text'][i])
    lbl = test_dataset['label'][i]
    prediction = get_reply(msg, model_2)
    #print(prediction)
    total +=1
    if prediction == lbl:
        correct += 1
    else:
        errors.append((msg ,lbl))
test_accuracy = 100.00 * correct / total
print("TEST ACCURACY  -- {}".format(test_accuracy))

### Results

- Linear

| Alpha (Real/Sintetic) | Num Samples Interpolated | Valid Accuracy % | Test Accuracy % | Epochs | Mini-Batch Size |
| --- | --- | --- | --- | -- | -- | 
| <font color='red'> 1 (no augmentation) </font> |  -  | <font color='red'> -  </font>| <font color='red'> -  </font> |  30  | 32 |
| 0.5 | 5  | - | - |  30  | 32 |
| 0.5 | 10 | - | - |  30  | 32 |
| 0.5 | 30 | - | - |  30  | 32 |
| 0.5 | 50 | - | - | 30    | 32 |
| 0.7 | 5  | - | - | 30    | 32  |
| 0.7 | 10 | - | - | 30    | 32  |
| 0.7 | 30 | - | - | 30    | 32  |
| 0.7 | 50 | - | - | 30    | 32  |
| 0.9 | 5  | - | - | 30    | 32  |
| 0.9 | 10 | - | - | 30    | 32  |
| 0.9 | 30 | - | - | 30  | 32  |
| 0.9 | 50 | - | - | 30  |  32 | 


- Nearest

| Alpha (Real/Sintetic) | Num Samples Interpolated | Valid Accuracy % | Test Accuracy % | Epochs | Mini-Batch Size |
| --- | --- | --- | --- | -- | -- | 
| <font color='red'> 1 (no augmentation) </font> |  -  | <font color='red'> -  </font>| <font color='red'> -  </font> |  30  | 32 |
| 0.5 | 5  | - | - |  30  | 32 |
| 0.5 | 10 | - | - |  30  | 32 |
| 0.5 | 30 | - | - |  30  | 32 |
| 0.5 | 50 | - | - | 30    | 32 |
| 0.7 | 5  | - | - | 30    | 32  |
| 0.7 | 10 | - | - | 30    | 32  |
| 0.7 | 30 | - | - | 30    | 32  |
| 0.7 | 50 | - | - | 30    | 32  |
| 0.9 | 5  | - | - | 30    | 32  |
| 0.9 | 10 | - | - | 30    | 32  |
| 0.9 | 30 | - | - | 30  | 32  |
| 0.9 | 50 | - | - | 30  |  32 | 

- _Mixup_

| Alpha (Real/Sintetic) | Num Samples Interpolated | Valid Accuracy % | Test Accuracy % | Epochs | Mini-Batch Size |
| --- | --- | --- | --- | -- | -- | 
| <font color='red'> 1 (no augmentation) </font> |  -  | <font color='red'> -  </font>| <font color='red'> -  </font> |  30  | 32 |
| 0.5 | 5  | - | - |  30  | 32 |
| 0.5 | 10 | - | - |  30  | 32 |
| 0.5 | 30 | - | - |  30  | 32 |
| 0.5 | 50 | - | - | 30    | 32 |
| 0.7 | 5  | - | - | 30    | 32  |
| 0.7 | 10 | - | - | 30    | 32  |
| 0.7 | 30 | - | - | 30    | 32  |
| 0.7 | 50 | - | - | 30    | 32  |
| 0.9 | 5  | - | - | 30    | 32  |
| 0.9 | 10 | - | - | 30    | 32  |
| 0.9 | 30 | - | - | 30  | 32  |
| 0.9 | 50 | - | - | 30  |  32 | 