# BCN model

In [1]:
import numpy as np

from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
from torch.nn import Linear
from torch.nn.functional import softmax, relu, tanh
from torchtext.vocab import Vectors, GloVe, CharNGram, FastText

from sklearn.manifold import TSNE

from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, show, push_notebook
output_notebook()


In [2]:
use_cuda = torch.cuda.is_available()

def get_variable(x):
    """ Converts tensors to cuda, if available. """
    if use_cuda:
        return x.cuda()
    return x

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

## SST

In [3]:


TEXT = data.Field(sequential=True, include_lengths = True)
LABEL = data.Field(sequential=False)

train_set, validation_set, test_set = datasets.SST.splits(TEXT,
                                                          LABEL,
                                                          fine_grained=False,
                                                          train_subtrees=True,
                                                          filter_pred=lambda ex: ex.label != 'neutral')

In [4]:
print('train.fields', train_set.fields)
print('len(train)', len(train_set))
print('vars(train[0])', vars(train_set[0]))
print()
print('Example 2', vars(train_set[17]))

train.fields {'text': <torchtext.data.field.Field object at 0x00000211DAA2AE48>, 'label': <torchtext.data.field.Field object at 0x00000211DAA2AE80>}
len(train) 98794
vars(train[0]) {'text': ['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'Century', "'s", 'new', '``', 'Conan', "''", 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'Arnold', 'Schwarzenegger', ',', 'Jean-Claud', 'Van', 'Damme', 'or', 'Steven', 'Segal', '.'], 'label': 'positive'}

Example 2 {'text': ['The', 'gorgeously', 'elaborate', 'continuation', 'of', '``', 'The', 'Lord', 'of', 'the', 'Rings', "''", 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'can', 'not', 'adequately', 'describe', 'co-writer\\/director', 'Peter', 'Jackson', "'s", 'expanded', 'vision', 'of', 'J.R.R.', 'Tolkien', "'s", 'Middle-earth', '.'], 'label': 'positive'}


In [5]:
# build the vocabulary
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
#url = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
TEXT.build_vocab(train_set, max_size=None, vectors=Vectors('wiki.simple.vec', url=url))
#TEXT.build_vocab(train_set, vectors=Vectors('glove.840B.300d.txt',url = url))
LABEL.build_vocab(train_set)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())


len(TEXT.vocab) 18005
TEXT.vocab.vectors.size() torch.Size([18005, 300])


In [6]:
# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_set, validation_set, test_set), batch_size=3)


# print batch information
batchsst = next(iter(train_iter))
print(batchsst.text.size())
print(batchsst.label)

torch.Size([8, 3])
tensor([1, 1, 1], device='cuda:0')


In [7]:
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)

dropout_rate = 0.2

input_dim = 100

con_dim = 200

hidden_dim = 300

batch_size = 3

# build the BoW model
class BCNNet(nn.Module):

    def __init__(self):
        super(BCNNet, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)
        # use pretrained embeddings
        #self.embeddings.weight.data.copy_(TEXT.vocab.vectors)
        
        # The ReLu activtion layer
        self.input = Linear(in_features = embedding_dim,
                            out_features = embedding_dim,
                             bias = False)
        
        # bilstm encoder
        self.bilstm_enc = nn.LSTM(input_size = embedding_dim,
                                  hidden_size = embedding_dim,
                                  batch_first = False,
                                  bidirectional = True)
       # self.hidenc_x = (Variable(torch.zeros(2, batch_size, hidden_dim)),
       #                  Variable(torch.zeros(2, batch_size, hidden_dim)))
       # self.hidenc_y = (Variable(torch.zeros(2, batch_size, hidden_dim)),
       #                  Variable(torch.zeros(2, batch_size, hidden_dim)))
        self.enc_h2l = Linear(in_features = hidden_dim*2,
                          out_features = num_classes,
                             bias = False)
        
        # bilstm integrator
        self.bilstm_int = nn.LSTM(input_size = embedding_dim*3*2,
                                  hidden_size = embedding_dim,
                                  batch_first = False,
                                  bidirectional = True)
       # self.hidint_x = (Variable(torch.zeros(2, batch_size, hidden_dim)),
       #                  Variable(torch.zeros(2, batch_size, hidden_dim)))
       # self.hidint_y = (Variable(torch.zeros(2, batch_size, hidden_dim)),
       #                  Variable(torch.zeros(2, batch_size, hidden_dim)))
        self.int_h2l = Linear(in_features = hidden_dim*2,
                              out_features = num_classes)
        
        #Pooling
        self.maxpool = nn.MaxPool1d(kernel_size = embedding_dim)
        self.avgpool = nn.AvgPool1d(kernel_size = embedding_dim)
        
        #Batchnorm
        #self.batchnorm = nn.BatchNorm1d()
                
        # maxout layer
        self.maxout = Linear(in_features = num_classes*2*2,
                                out_features = num_classes*2*2,
                                bias = False)
        self.out = Linear(in_features = num_classes*2*2,
                                out_features = num_classes,
                                
                                bias = False)
        
        # dropout
        self.drop = nn.Dropout(p = dropout_rate)
        
    def forward(self, x, y):
        out = {}
        
        # Embedding
        x = self.embeddings(x)
        y = self.embeddings(y)
        
        # Relu
        x = relu(self.input(x))
        y = relu(self.input(y))
     #   print(x.size())
        
        # Encoder
        # biLSTM
        x, hidenc_fx = self.bilstm_enc(x)
        y, hidenc_fy = self.bilstm_enc(y)
     #   print(x.size())
        X = (hidenc_fx[0]) # (2,3,300) 1,3,300
        X = torch.cat((X[0],X[1]),1) # 1, 3, 600
        Y = (hidenc_fy[0])
        Y = torch.cat((Y[0],Y[1]),1)
        X = X.view(1,X.size()[0],X.size()[1])
        Y = Y.view(1,Y.size()[0],Y.size()[1])
       # print(X.size())
        
        # Biattention
        # X & Y ?
        #dim 1 = len, dim 2 = dim, and they are swaped so we can multiply the matrices
        A = torch.matmul(X, Y.transpose(1,2)) # A = X * Y^T  x = bs, len, dim, y^T = bs,dim, len
        A_x = softmax(A) # A_x = softmax(A)
        A_y = softmax(A.transpose(1,2)) # A_y = softmax(A^T)
        C_x = torch.matmul(A_x.transpose(1,2), X) # C_x = A_x^T * X 
        C_y = torch.matmul(A_y.transpose(1,2), Y) # C_y = A_y^T * Y
        
        # Integrator
        # input for integrator bilstm
        x = torch.cat((X, X-C_y, torch.mul(X, C_y)), 2) # Concat([X; X-C_y; X.C_y])
        y = torch.cat((Y, Y-C_x, torch.mul(Y, C_x)), 2) # Concat([Y; Y-C_x; Y.C_x]
        # X_|y = biLSTM(%), Y|x = biLSTM(%)
        X_y, hidint_x  = self.bilstm_int(x)
        Y_x, hidint_y = self.bilstm_int(y)
        
        X_temp = hidint_x[0]
        X_temp = torch.cat((X_temp[0],X_temp[1]),1)
        hidint_x = X_temp.view(1,X_temp.size()[0],X_temp.size()[1])
        
        Y_temp = hidint_y[0]
        Y_temp = torch.cat((Y_temp[0],Y_temp[1]),1)
        hidint_y = Y_temp.view(1,Y_temp.size()[0],Y_temp.size()[1])
        
        
        
        # Pooling
        # x_self = ?
        # x_pool = Concat([max(X_|y); mean(X_|y); min(X_|y); x_self])
        x_maxpool = self.maxpool(hidint_x)
        x_meanpool = self.avgpool(hidint_x)
       # print(hidint_x[0].size())
       # print(X_y.size())
        
      #  param = list(X_y.parameters())
      #  print(param)
        
        x_minpool = hidint_x * -1
        x_minpool = self.maxpool(x_minpool)
        x_minpool = x_minpool * -1
        x_pool = torch.cat((x_maxpool, x_meanpool, x_minpool), 2)
        
        y_maxpool = self.maxpool(hidint_y)
        y_meanpool = self.avgpool(hidint_y)
        y_minpool = hidint_y * -1
        y_minpool = self.maxpool(y_minpool)
        y_minpool = y_minpool * -1
       # print(y_minpool.size())
        y_pool = torch.cat((y_maxpool, y_meanpool, y_minpool), 2)
        
        
       # print(x_pool.size())
       # print(y_pool.size())
        # y_self = ?
        # y_pool = Concat([max(Y_|x); mean(Y_|x); min(Y_|x); y_self])
      #  y_pool = torch.cat((torch.max(hidint_y[0], 0), torch.mean(hidint_y[0], 0), torch.min(hidint_y[0], 0)), 0)
        
        # Maxout layer
        # With dropout and batchnormilization
        # Joined representations is the concatination of x_pool and y_pool
        joined = torch.cat((x_pool, y_pool), 2)
        #print(joined.size())
        joined = torch.squeeze(joined,0)
        #print(joined.size())
        joined = self.drop(joined)
        #joined = torch.nn.BatchNorm1d(joined)
        
        # 1st Layer
       # print(joined.size())
        #print(joined)
        joined = self.maxout(joined)
        joined = self.drop(joined)
       # print(joined[0].size())
        #joined = torch.nn.BatchNorm1d(joined)
        #print(joined.size())
        
        # 2nd layer
        joined = self.maxout(joined)
        joined = self.drop(joined)
        #joined = torch.nn.BatchNorm1d(joined)
        
       # print(joined)
        
        # 3rd maxout layer will be output
        out['out'] = self.out(joined)
        return out

net = BCNNet()
if use_cuda:
    net.cuda()
print(net)

BCNNet(
  (embeddings): Embedding(18005, 300)
  (input): Linear(in_features=300, out_features=300, bias=False)
  (bilstm_enc): LSTM(300, 300, bidirectional=True)
  (enc_h2l): Linear(in_features=600, out_features=3, bias=False)
  (bilstm_int): LSTM(1800, 300, bidirectional=True)
  (int_h2l): Linear(in_features=600, out_features=3, bias=True)
  (maxpool): MaxPool1d(kernel_size=300, stride=300, padding=0, dilation=1, ceil_mode=False)
  (avgpool): AvgPool1d(kernel_size=(300,), stride=(300,), padding=(0,))
  (maxout): Linear(in_features=12, out_features=12, bias=False)
  (out): Linear(in_features=12, out_features=3, bias=False)
  (drop): Dropout(p=0.2)
)


In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(net.parameters(), lr=0.001,weight_decay=0.001)

def accuracy(ys, ts):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

In [9]:
def construct_sentences(batch):
    """    
    Parameters
    ----------
    batch: torchtext.data.batch.Batch
    
    Returns
    -------
    [str]
    """
    return [" ".join([TEXT.vocab.itos[elm] 
                      for elm in get_numpy(batch.text[:,i])])
            for i in range(batch.text.size()[1])]

def get_labels(batch):
    """
    Parameters
    ----------
    batch: torchtext.data.batch.Batch
    
    Returns
    -------
    [str]
    """
    return [LABEL.vocab.itos[get_numpy(batch.label[i])] for i in range(len(batch.label))]


In [21]:
#derp = next(iter(val_iter))
#print(derp.text[0])

## TRAINING BCN WITH SST

In [10]:
max_iter = 1000
eval_every = 500
log_every = 200

# will be updated while iterating
#tsne_plot = show(p, notebook_handle=True)

train_loss, train_accs = [], []

net.train()
for i, batch in enumerate(train_iter):
    if i % eval_every == 0:
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
      #  val_meta = {'label_idx': [], 'sentences': [], 'labels': []}
        for val_batch in val_iter:
            output = net(val_batch.text,val_batch.text)
            # batches sizes might vary, which is why we cannot just mean the batch's loss
            # we multiply the loss and accuracies with the batch's size,
            # to later divide by the total size
            #print(val_batch.label.size())
          #  print(val_batch.label.size())
            val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size
            val_accs += accuracy(output['out'], val_batch.label) * val_batch.batch_size
            val_lengths += val_batch.batch_size
            
       #     for key, _val in output.items():
       #         if key not in val_meta:
       #             val_meta[key] = []
       #         val_meta[key].append(get_numpy(_val)) 
       #     val_meta['label_idx'].append(get_numpy(val_batch.label))
       #     val_meta['sentences'].append(construct_sentences(val_batch))
       #     val_meta['labels'].append(get_labels(val_batch))
        
       # for key, _val in val_meta.items():
       #     val_meta[key] = np.concatenate(_val)
        
        # divide by the total accumulated batch sizes
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        print("valid, it: {} loss: {:.2f} accs: {:.2f}\n".format(i, get_numpy(val_losses), get_numpy(val_accs)))
        #update_plot(val_meta, 'bow', tsne_plot)
        
        net.train()
    
    output = net(batchsst.text,batchsst.text)
    batch_loss = criterion(output['out'], batchsst.label)
    
    train_loss.append(get_numpy(batch_loss))
    train_accs.append(get_numpy(accuracy(output['out'], batchsst.label)))
    
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    
    if i % log_every == 0:        
        print("train, it: {} loss: {:.2f} accs: {:.2f}".format(i, 
                                                               np.mean(train_loss), 
                                                               np.mean(train_accs)))
        # reset
        train_loss, train_accs = [], []
        
    if max_iter < i:
        break

  return Variable(arr, volatile=not train)


valid, it: 0 loss: 1.10 accs: 0.29

train, it: 0 loss: 1.11 accs: 0.00


RuntimeError: CUDA error: out of memory

## SNLI

In [None]:
print("Run test on SNLI...")
TEXT = datasets.snli.ParsedTextField()
LABEL = data.LabelField()
TREE = datasets.snli.ShiftReduceField()

train, val, test = datasets.SNLI.splits(TEXT, LABEL, TREE)

print("Fields:", train.fields)
print("Number of examples:\n", len(train))
print("First Example instance:\n", vars(train[0]))

url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.build_vocab(train, max_size=None, vectors=[CharNGram(),Vectors('wiki.simple.vec', url=url)])
#TEXT.build_vocab(train,vectors=GloVe[name='840B',dim='300'])
LABEL.build_vocab(train)


train_iter, val_iter, test_iter = data.Iterator.splits((train, val, test), batch_size=3)

#batch = next(iter(train_iter))
#print("Numericalize premises:\n", batch.premise)
#print("Numericalize hypotheses:\n", batch.hypothesis)
#print("Entailment labels:\n", batch.label)

print("Test iters function")
train_iter, val_iter, test_iter = datasets.SNLI.iters(batch_size=4, trees=True)

batch = next(iter(train_iter))
print("Numericalize premises:\n", batch.premise)
print("Numericalize hypotheses:\n", batch.hypothesis)
print("Entailment labels:\n", batch.label)

#val_iter_set = next(iter(val_iter))


In [None]:
#TEXT.build_vocab(train)
#LABEL.build_vocab(train)
print(TEXT.vocab.vectors.size()[0])
print(len(LABEL.vocab.itos))
#print(vars(TEXT.vocab))
#print(vars(LABEL.vocab))

In [None]:
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)

dropout_rate = 0.5

input_dim = 100

con_dim = 200

# build the BoW model
class BoWNet(nn.Module):

    def __init__(self):
        super(BoWNet, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)
        # use pretrained embeddings
        #self.embeddings.weight.data.copy_(TEXT.vocab.vectors)
        
        # add hidden layers
        # YOUR CODE HERE!
        
        self.input = Linear(in_features = embedding_dim,
                             out_features = input_dim,
                             bias = False)
        
        self.l_1 = Linear(in_features=con_dim,
                           out_features=con_dim,
                           bias = False)
        self.l_2 = Linear(in_features=con_dim,
                           out_features=con_dim,
                           bias=False)
        self.l_3 = Linear(in_features=con_dim,
                           out_features = con_dim,
                           bias = False)
        
        self.drop = nn.Dropout(p = dropout_rate)
        
        # output layer
        self.l_out = Linear(in_features=con_dim,
                            out_features=num_classes,
                            bias=False)
        
    def forward(self, x, y):
        out = {}
        # get embeddings
        x = self.embeddings(x) # (bs,len,300)
        y = self.embeddings(y)
        
        x = self.drop(x)
        y = self.drop(y)
        
        #sum_x =  # (bs,300) 
        
        sum_x = torch.sum(x,0)
        sum_y = torch.sum(y,0)
        
              
        #tanh # (bs,100)
        
        sum_x = torch.tanh(self.input(sum_x))
        sum_y = torch.tanh(self.input(sum_y))

    
        z = torch.cat((sum_x,sum_y),1)
        
        z = torch.tanh(self.l_1(z))     
        z = torch.tanh(self.l_2(z))
        z = torch.tanh(self.l_3(z))
       # print(z.size())
        
       # z = self.drop(z)

        # Softmax
        out['out'] = self.l_out(z)
        return out

net = BoWNet()
if use_cuda:
    net.cuda()
print(net)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(net.parameters(), lr=0.001,weight_decay=0.001)

def accuracy(ys, ts):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

In [None]:
max_iter = 5000
eval_every = 500
log_every = 200

# will be updated while iterating
#tsne_plot = show(p, notebook_handle=True)

train_loss, train_accs = [], []

net.train()
for i, batch in enumerate(train_iter):
    if i % eval_every == 0:
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
       # val_meta = {'label_idx': [], 'sentences': [], 'labels': []}
        for val_batch in val_iter:
            output = net(val_batch.premise,val_batch.hypothesis)
            # batches sizes might vary, which is why we cannot just mean the batch's loss
            # we multiply the loss and accuracies with the batch's size,
            # to later divide by the total size
            #print(output['out'])
            #print(val_batch.label)
            val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size
            val_accs += accuracy(output['out'], val_batch.label) * val_batch.batch_size
            val_lengths += val_batch.batch_size
           # print(val_batch.batch_size)
            
        
        # divide by the total accumulated batch sizes
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        print("valid, it: {} loss: {:.2f} accs: {:.2f}\n".format(i, get_numpy(val_losses), get_numpy(val_accs)))
        #update_plot(val_meta, 'bow', tsne_plot)
        
        net.train()
    
    output = net(batch.premise,batch.hypothesis)
    batch_loss = criterion(output['out'], batch.label)
    
    train_loss.append(get_numpy(batch_loss))
    train_accs.append(get_numpy(accuracy(output['out'], batch.label)))
    
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    
    if i % log_every == 0:        
        print("train, it: {} loss: {:.2f} accs: {:.2f}".format(i, 
                                                               np.mean(train_loss), 
                                                               np.mean(train_accs)))
        # reset
        train_loss, train_accs = [], []
        
    if max_iter < i:
        break