# LSTM RNN

This is an implementation of the LSTM RNN from Bowman et al. (https://nlp.stanford.edu/pubs/snli_paper.pdf) for text classification on the SST dataset and Textual entailment on the SNLI dataset. 

## Initialzation

Required packages and helper functions.

In [40]:
import numpy as np

from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
from torch.nn import Linear
from torch.nn.functional import softmax, relu, tanh
from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

from sklearn.manifold import TSNE

from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, show, push_notebook
output_notebook()

In [41]:
use_cuda = torch.cuda.is_available()

def get_variable(x):
    """ Converts tensors to cuda, if available. """
    if use_cuda:
        return x.cuda()
    return x

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

def construct_sentences(batch):
    """    
    Parameters
    ----------
    batch: torchtext.data.batch.Batch
    
    Returns
    -------
    [str]
    """
    return [" ".join([TEXT.vocab.itos[elm] 
                      for elm in get_numpy(batch.text[:,i])])
            for i in range(batch.text.size()[1])]

def get_labels(batch):
    """
    Parameters
    ----------
    batch: torchtext.data.batch.Batch
    
    Returns
    -------
    [str]
    """
    return [LABEL.vocab.itos[get_numpy(batch.label[i])] for i in range(len(batch.label))]

def accuracy(ys, ts):
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    return torch.mean(correct_prediction.float())

## SST
### Data creation

In [42]:
TEXT = data.Field(sequential=True, include_lengths = True)
LABEL = data.Field(sequential=False)

train_set, validation_set, test_set = datasets.SST.splits(TEXT,
                                                          LABEL,
                                                          fine_grained=False,
                                                          train_subtrees=True,
                                                          filter_pred=lambda ex: ex.label != 'neutral')

In [43]:
# build the vocabulary
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.build_vocab(train_set, max_size=None, vectors=Vectors('wiki.simple.vec', url=url))
LABEL.build_vocab(train_set)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, val_iter, test_iter = data.Iterator.splits(
    (train_set, validation_set, test_set), batch_size=3, sort_key=lambda x: len(x.text),sort_within_batch=True, repeat=False)

len(TEXT.vocab) 18005
TEXT.vocab.vectors.size() torch.Size([18005, 300])


### Model framework

In [44]:
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)

dropout_rate = 0.2
input_dim = 100
con_dim = 200

# build the LSTM model
class LSTMNet(nn.Module):

    def __init__(self):
        super(LSTMNet, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)
        # use pretrained embeddings
        self.embeddings.weight.data.copy_(TEXT.vocab.vectors)

        # Simple RNN LSTM Layer
        self.lstm_input = nn.LSTM(input_size = embedding_dim,
                                  hidden_size = embedding_dim,
                                  num_layers = 1)
        
        # Linear layer (with tanh activation) for mapping to lower dimensions
        self.input = Linear(in_features = embedding_dim,
                             out_features = input_dim,
                             bias = False)
        
        # Three stacked linear layers (with tanh activation)
        self.l_1 = Linear(in_features=con_dim,
                           out_features=con_dim,
                           bias = False)
        self.l_2 = Linear(in_features=con_dim,
                           out_features=con_dim,
                           bias=False)
        self.l_3 = Linear(in_features=con_dim,
                           out_features = con_dim,
                           bias = False)
        
        
        # Applied dropout
        self.drop = nn.Dropout(p = dropout_rate)
        
        # Putput layer
        self.l_out = Linear(in_features=con_dim,
                            out_features=num_classes,
                            bias=False)
        
        
    def forward(self, x, y):
        out = {}
        
        # input = (text_tensor, seq_length_tensor)
        x_text = x[0] #Text of input
        y_text = y[0]
        x_len = x[1] #Sequence length of input
        y_len = y[1]
        
        # Get embeddings
        x = self.embeddings(x_text) # (batch size, length, embedding dim)
        y = self.embeddings(y_text)
        
        # Applied dropout
        x = self.drop(x)
        y = self.drop(y)

        # Packing padded sequences to max_length
        packed_x = pack(x,x_len.view(-1).tolist(), batch_first = False)
        packed_y = pack(y,y_len.view(-1).tolist(), batch_first = False)
        
        
        # LSTM RNN Layer
        x, ht_x0 = self.lstm_input(packed_x)
        y, ht_y0 = self.lstm_input(packed_y)
        
        # Unpacking packed tensors
        unpacked_x, unpacked_len = unpack(x, batch_first = False)
        unpacked_y, unpacked_len = unpack(y, batch_first = False)
        xt = unpacked_x
        yt = unpacked_y

        # Mapping input from 300 dim to 100 dim and concatenating
        xt = torch.tanh(self.input(xt))
        yt = torch.tanh(self.input(yt))
        z = torch.cat((xt,yt),2)
        
        # Three stacked tanh layers
        z = torch.tanh(self.l_1(z))     
        z = torch.tanh(self.l_2(z))
        z = torch.tanh(self.l_3(z))
 
        # Softmax
        out['out'] = softmax(self.l_out(z[0]), 1)
        return out

net = LSTMNet()
if use_cuda:
    net.cuda()
print(net)

LSTMNet(
  (embeddings): Embedding(18005, 300)
  (lstm_input): LSTM(300, 300)
  (input): Linear(in_features=300, out_features=100, bias=False)
  (l_1): Linear(in_features=200, out_features=200, bias=False)
  (l_2): Linear(in_features=200, out_features=200, bias=False)
  (l_3): Linear(in_features=200, out_features=200, bias=False)
  (drop): Dropout(p=0.2)
  (l_out): Linear(in_features=200, out_features=3, bias=False)
)


### Training the model

In [46]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(net.parameters(), lr=0.1,weight_decay=0.001)

max_iter = 2000
eval_every = 500
log_every = 200


train_loss, train_accs = [], []

net.train()
for i, batch in enumerate(train_iter):
    if i % eval_every == 0:
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
      #  val_meta = {'label_idx': [], 'sentences': [], 'labels': []}
        for val_batch in val_iter:
            output = net(val_batch.text,val_batch.text)
            # batches sizes might vary, which is why we cannot just mean the batch's loss
            # we multiply the loss and accuracies with the batch's size,
            # to later divide by the total size
            #print(val_batch.label.size())
          #  print(val_batch.label.size())
            val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size
            val_accs += accuracy(output['out'], val_batch.label) * val_batch.batch_size
            val_lengths += val_batch.batch_size
            
       #     for key, _val in output.items():
       #         if key not in val_meta:
       #             val_meta[key] = []
       #         val_meta[key].append(get_numpy(_val)) 
       #     val_meta['label_idx'].append(get_numpy(val_batch.label))
       #     val_meta['sentences'].append(construct_sentences(val_batch))
       #     val_meta['labels'].append(get_labels(val_batch))
        
       # for key, _val in val_meta.items():
       #     val_meta[key] = np.concatenate(_val)
        
        # divide by the total accumulated batch sizes
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        print("valid, it: {} loss: {:.2f} accs: {:.2f}\n".format(i, get_numpy(val_losses), get_numpy(val_accs)))
        #update_plot(val_meta, 'bow', tsne_plot)
        
        net.train()
    
    output = net(batchsst.text,batchsst.text)
    batch_loss = criterion(output['out'], batchsst.label)
    
    train_loss.append(get_numpy(batch_loss))
    train_accs.append(get_numpy(accuracy(output['out'], batchsst.label)))
    
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    
    if i % log_every == 0:        
        print("train, it: {} loss: {:.2f} accs: {:.2f}".format(i, 
                                                               np.mean(train_loss), 
                                                               np.mean(train_accs)))
        # reset
        train_loss, train_accs = [], []
        
    if max_iter < i:
        break

  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


valid, it: 0 loss: 1.10 accs: 0.51

train, it: 0 loss: 1.09 accs: 0.67
train, it: 200 loss: 0.83 accs: 0.77
train, it: 400 loss: 0.55 accs: 1.00
valid, it: 500 loss: 1.07 accs: 0.51

train, it: 600 loss: 0.55 accs: 1.00
train, it: 800 loss: 0.55 accs: 1.00
valid, it: 1000 loss: 1.07 accs: 0.51

train, it: 1000 loss: 0.55 accs: 1.00
train, it: 1200 loss: 0.55 accs: 1.00
train, it: 1400 loss: 0.55 accs: 1.00
valid, it: 1500 loss: 1.07 accs: 0.51

train, it: 1600 loss: 0.55 accs: 1.00
train, it: 1800 loss: 0.55 accs: 1.00
valid, it: 2000 loss: 1.07 accs: 0.51

train, it: 2000 loss: 0.55 accs: 1.00


# SNLI

In [47]:
print("Run test on SNLI...")
TEXT = data.Field(sequential=True, include_lengths = True)
LABEL = data.Field(sequential=False)
TREE = datasets.snli.ShiftReduceField()

train, val, test = datasets.SNLI.splits(TEXT, LABEL, TREE)

print("Fields:", train.fields)
print("Number of examples:\n", len(train))
print("First Example instance:\n", vars(train[0]))

url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.build_vocab(train, max_size=None, vectors=[CharNGram(),Vectors('wiki.simple.vec', url=url)])
#TEXT.build_vocab(train,vectors=GloVe[name='840B',dim='300'])
LABEL.build_vocab(train)


train_iter, val_iter, test_iter = data.Iterator.splits(
    (train_set, validation_set, test_set), batch_size=3, sort_key=lambda x: len(x.premise),sort_within_batch=True, repeat=False)

#batch = next(iter(train_iter))
#print("Numericalize premises:\n", batch.premise)
#print("Numericalize hypotheses:\n", batch.hypothesis)
#print("Entailment labels:\n", batch.label)

print("Test iters function")
train_iter, val_iter, test_iter = datasets.SNLI.iters(batch_size=4, trees=True)

batch = next(iter(train_iter))
print("Numericalize premises:\n", batch.premise)
print("Numericalize hypotheses:\n", batch.hypothesis)
print("Entailment labels:\n", batch.label)

#val_iter_set = next(iter(val_iter))


Run test on SNLI...


FileNotFoundError: [Errno 2] No such file or directory: '.data\\snli\\snli_1.0\\snli_1.0_train.jsonl'

In [None]:
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)

dropout_rate = 0.2
input_dim = 100
con_dim = 200


# build the LSTM model
class LSTMNet(nn.Module):

    def __init__(self):
        super(LSTMNet, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)
        # use pretrained embeddings
        self.embeddings.weight.data.copy_(TEXT.vocab.vectors)

        # Simple RNN LSTM Layer
        self.lstm_input = nn.LSTM(input_size = embedding_dim,
                                  hidden_size = embedding_dim,
                                  num_layers = 1)
        
        # Linear layer (with tanh activation) for mapping to lower dimensions
        self.input = Linear(in_features = embedding_dim,
                             out_features = input_dim,
                             bias = False)
        
        # Three stacked linear layers (with tanh activation)
        self.l_1 = Linear(in_features=con_dim,
                           out_features=con_dim,
                           bias = False)
        self.l_2 = Linear(in_features=con_dim,
                           out_features=con_dim,
                           bias=False)
        self.l_3 = Linear(in_features=con_dim,
                           out_features = con_dim,
                           bias = False)
        
        
        # Applied dropout
        self.drop = nn.Dropout(p = dropout_rate)
        
        # Putput layer
        self.l_out = Linear(in_features=con_dim,
                            out_features=num_classes,
                            bias=False)
        
        
    def forward(self, x, y):
        out = {}
        
        # input = (text_tensor, seq_length_tensor)
        x_text = x[0] #Text of input
        y_text = y[0]
        x_len = x[1] #Sequence length of input
        y_len = y[1]
        
        # Get embeddings
        x = self.embeddings(x_text) # (batch size, length, embedding dim)
        y = self.embeddings(y_text)
        
        # Applied dropout
        x = self.drop(x)
        y = self.drop(y)

        # Packing padded sequences to max_length
        packed_x = pack(x,x_len.view(-1).tolist(), batch_first = False)
        packed_y = pack(y,y_len.view(-1).tolist(), batch_first = False)
        
        
        # LSTM RNN Layer
        x, ht_x0 = self.lstm_input(packed_x)
        y, ht_y0 = self.lstm_input(packed_y)
        
        # Unpacking packed tensors
        unpacked_x, unpacked_len = unpack(x, batch_first = False)
        unpacked_y, unpacked_len = unpack(y, batch_first = False)
        xt = unpacked_x
        yt = unpacked_y

        # Mapping input from 300 dim to 100 dim and concatenating
        xt = torch.tanh(self.input(xt))
        yt = torch.tanh(self.input(yt))
        z = torch.cat((xt,yt),2)
        
        # Three stacked tanh layers
        z = torch.tanh(self.l_1(z))     
        z = torch.tanh(self.l_2(z))
        z = torch.tanh(self.l_3(z))
 
        # Softmax
        out['out'] = softmax(self.l_out(z[0]), 1)
        return out

net = LSTMNet()
if use_cuda:
    net.cuda()
print(net)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(net.parameters(), lr=0.01,weight_decay=0.001)

def accuracy(ys, ts):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

In [None]:

max_iter = 2000
eval_every = 500
log_every = 200

# will be updated while iterating
#tsne_plot = show(p, notebook_handle=True)

train_loss, train_accs = [], []

net.train()
for i, batch in enumerate(train_iter):
    if i % eval_every == 0:
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
       # val_meta = {'label_idx': [], 'sentences': [], 'labels': []}
        for val_batch in val_iter:
            output = net(val_batch.premise,val_batch.hypothesis)
            # batches sizes might vary, which is why we cannot just mean the batch's loss
            # we multiply the loss and accuracies with the batch's size,
            # to later divide by the total size
            #print(output['out'])
            #print(val_batch.label)
            val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size
            val_accs += accuracy(output['out'], val_batch.label) * val_batch.batch_size
            val_lengths += val_batch.batch_size
           # print(val_batch.batch_size)
            
        
        # divide by the total accumulated batch sizes
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        print("valid, it: {} loss: {:.2f} accs: {:.2f}\n".format(i, get_numpy(val_losses), get_numpy(val_accs)))
        #update_plot(val_meta, 'bow', tsne_plot)
        
        net.train()
    
    output = net(batch.premise,batch.hypothesis)
    batch_loss = criterion(output['out'], batch.label)
    
    train_loss.append(get_numpy(batch_loss))
    train_accs.append(get_numpy(accuracy(output['out'], batch.label)))
    
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    
    if i % log_every == 0:        
        print("train, it: {} loss: {:.2f} accs: {:.2f}".format(i, 
                                                               np.mean(train_loss), 
                                                               np.mean(train_accs)))
        # reset
        train_loss, train_accs = [], []
        
    if max_iter < i:
        break