In [1]:
import time
import pickle
import argparse
import torch
import torch.nn as nn

from torch.utils.data import DataLoader
from esim.data import NLIDataset
from esim.utils import correct_predictions, masked_softmax
from sklearn import metrics

In [134]:
class InterAttention(nn.Module):
    """
    Inter-sentence attention weights between premises and hypotheses.

    This module only produces the two attention-weight tensors; it does not
    take any weighted sum of the encoded vectors.
    """

    def forward(self,
                premise_batch,
                premise_mask,
                hypothesis_batch,
                hypothesis_mask):
        """
        Args:
            premise_batch: A batch of sequences of vectors representing the
                premises, of size (batch, premise_len, vector_dim).
            premise_mask: Mask for the premise sequences, passed to
                `masked_softmax` when normalising the hypothesis-to-premise
                scores.
            hypothesis_batch: A batch of sequences of vectors representing
                the hypotheses, of size (batch, hypothesis_len, vector_dim).
            hypothesis_mask: Mask for the hypothesis sequences, passed to
                `masked_softmax` when normalising the premise-to-hypothesis
                scores.

        Returns:
            prem_hyp_attn: `masked_softmax` of the (batch, premise_len,
                hypothesis_len) dot-product scores under `hypothesis_mask`.
            hyp_prem_attn: `masked_softmax` of the transposed scores,
                (batch, hypothesis_len, premise_len), under `premise_mask`.
        """
        # Pairwise dot products between every premise and hypothesis vector.
        hypotheses_t = hypothesis_batch.transpose(1, 2).contiguous()
        scores = torch.bmm(premise_batch, hypotheses_t)

        # The transposed scores are made contiguous before being handed to
        # masked_softmax, exactly as the scores themselves already are.
        scores_t = scores.transpose(2, 1).contiguous()

        prem_hyp_attn = masked_softmax(scores, hypothesis_mask)
        hyp_prem_attn = masked_softmax(scores_t, premise_mask)
        return prem_hyp_attn, hyp_prem_attn

In [135]:
class IntraAttention(nn.Module):
    """
    Compute the intra-attention with adaptive attention.

    The learnable parameters W, b and v are shared by every example: they
    carry a leading singleton axis and are expanded to whatever batch size
    arrives in `forward`, so partial (last) batches work too.
    """
    def __init__(self, batch_size, hidden_size):
        """
        Args:
            batch_size: Kept for backward compatibility and stored on the
                module; the parameters no longer depend on it.
            hidden_size: Half of the input vector dimension; inputs to
                `forward` must have vector_dim == 2 * hidden_size.
        """
        super(IntraAttention, self).__init__()
        self.hidden_size2 = hidden_size * 2
        self.batch_size = batch_size
        # nn.Parameter already sets requires_grad=True.
        self.W = nn.Parameter(torch.randn(1, self.hidden_size2, self.hidden_size2))
        self.b = nn.Parameter(torch.randn(1, self.hidden_size2, 1))
        self.v = nn.Parameter(torch.randn(1, 1, self.hidden_size2))

    def forward(self,
                premise_batch):
        """
        Args:
            premise_batch: A batch of sequences of vectors of size
                (batch, sequences, vector_dim), with
                vector_dim == 2 * hidden_size.

        Returns:
            beta_batch: The intra-attention weights, of size
                (batch, sequences, 1), normalised with a softmax over the
                sequence axis. No padding mask is applied.
        """
        batch = premise_batch.size(0)
        # expand() creates broadcast views, so the shared parameters are not
        # copied once per example.
        W_batch = self.W.expand(batch, -1, -1)
        v_batch = self.v.expand(batch, -1, -1)

        # (batch, vector_dim, sequences)
        premise_batch_t = torch.transpose(premise_batch, 1, 2).contiguous()

        # f = v . tanh(W x + b); self.b broadcasts over batch and sequences.
        # Result: (batch, 1, sequences).
        f_batch = torch.bmm(v_batch,
                            torch.tanh(torch.bmm(W_batch, premise_batch_t) + self.b))
        alpha_batch = torch.softmax(f_batch, 2)
        alpha_batch_t = torch.transpose(alpha_batch, 1, 2).contiguous()

        # Attention-weighted summary of the sequence: (batch, vector_dim, 1).
        c_batch = torch.bmm(premise_batch_t, alpha_batch_t)

        # Score of every position against the summary: (batch, sequences, 1).
        g_batch = torch.bmm(premise_batch, c_batch)
        beta_batch = torch.softmax(g_batch, 1)

        return beta_batch

In [136]:
class AttentionCombination(nn.Module):
    """
    Combine precomputed attention weights with the encoded sequences.

    Unlike a full soft-attention layer, this module does not compute the
    similarity matrix or its softmax itself: the attention weights are
    passed in (for example from `InterAttention`) and are only applied here
    through `weighted_sum`.
    """

    def forward(self,
                premise_batch,
                premise_mask,
                hypothesis_batch,
                hypothesis_mask,
                prem_hyp_attn,
                hyp_prem_attn,
                beta_batch):
        """
        Args:
            premise_batch: A batch of sequences of vectors representing the
                premises in some NLI task. The batch is assumed to have the
                size (batch, sequences, vector_dim).
            premise_mask: A mask for the sequences in the premise batch,
                forwarded to `weighted_sum` for the attended premises.
            hypothesis_batch: A batch of sequences of vectors representing the
                hypotheses in some NLI task. The batch is assumed to have the
                size (batch, sequences, vector_dim).
            hypothesis_mask: A mask for the sequences in the hypotheses batch,
                forwarded to `weighted_sum` for the attended hypotheses.
            prem_hyp_attn: Attention weights of the premises over the
                hypotheses.
            hyp_prem_attn: Attention weights of the hypotheses over the
                premises.
            beta_batch: Currently unused; accepted so existing callers keep
                working.

        Returns:
            attended_premises: `weighted_sum` of the hypotheses under
                `prem_hyp_attn`.
            attended_hypotheses: `weighted_sum` of the premises under
                `hyp_prem_attn`.
        """
        # NOTE(review): `weighted_sum` was never imported in this notebook, so
        # the original call raised NameError. It is assumed to live in
        # esim.utils next to masked_softmax -- confirm against the esim package.
        from esim.utils import weighted_sum

        # Weighted sums of the hypotheses for the premises attention,
        # and vice-versa for the attention of the hypotheses.
        attended_premises = weighted_sum(hypothesis_batch,
                                         prem_hyp_attn,
                                         premise_mask)
        attended_hypotheses = weighted_sum(premise_batch,
                                           hyp_prem_attn,
                                           hypothesis_mask)

        return attended_premises, attended_hypotheses

In [139]:
def get_non_zeros(hypotheses):
    """
    Count the non-zero entries in every row of a 2D tensor.

    Args:
        hypotheses: A tensor of size (rows, columns).

    Returns:
        A list with one int per row: the number of entries in that row that
        are different from 0.
    """
    # One vectorised reduction instead of a Python loop calling .item() on
    # every element.
    return (hypotheses != 0).sum(dim=1).tolist()

In [140]:
def get_non_zeros_2D(hypotheses):
    """
    Count the non-zero entries of every row in a nested iterable.

    In this notebook it is fed the output of `get_non_zeros_3D`, which turns
    the per-position counts into the number of positions per sequence that
    have at least one non-zero component.

    Args:
        hypotheses: An iterable of rows, each row an iterable of values.

    Returns:
        A list with one int per row: how many values in it differ from 0.
    """
    return [sum(1 for value in row if value != 0) for row in hypotheses]

In [141]:
def get_non_zeros_3D(encoded_premises):
    """
    Count, for every position of every sequence, its non-zero components.

    Args:
        encoded_premises: A tensor of size (batch, sequences, vector_dim).

    Returns:
        A nested list of size (batch, sequences) whose entries are the number
        of components along the last axis that are different from 0.

    Side effects:
        Prints the size of the input tensor.
    """
    shape = encoded_premises.size()
    print(shape)
    # One reduction over the last axis replaces a triple Python loop that
    # called .item() once per element.
    return (encoded_premises != 0).sum(dim=2).tolist()

In [142]:
"""
Definition of the ESIM model.
"""
import torch
import torch.nn as nn

from esim.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention
from esim.utils import get_mask, replace_masked


class ESIM(nn.Module):
    """
    Implementation of the ESIM model presented in the paper "Enhanced LSTM for
    Natural Language Inference" by Chen et al.

    This notebook copy is instrumented for debugging: the constructor and
    `forward` print their inputs and several intermediate tensors.
    """

    def __init__(self,
                 vocab_size,
#                  batch_size,
                 embedding_dim,
                 hidden_size,
                 embeddings=None,
                 padding_idx=0,
                 dropout=0.5,
                 num_classes=2,
                 device="cpu"):
        """
        Args:
            vocab_size: The size of the vocabulary of embeddings in the model.
            embedding_dim: The dimension of the word embeddings.
            hidden_size: The size of all the hidden layers in the network.
            embeddings: A tensor of size (vocab_size, embedding_dim) containing
                pretrained word embeddings. If None, word embeddings are
                initialised randomly. Defaults to None.
            padding_idx: The index of the padding token in the premises and
                hypotheses passed as input to the model. Defaults to 0.
            dropout: The dropout rate to use between the layers of the network.
                A dropout rate of 0 corresponds to using no dropout at all.
                Defaults to 0.5.
            num_classes: The number of classes in the output of the network.
                Defaults to 2.
            device: The name of the device on which the model is being
                executed. Defaults to 'cpu'.
        """
        super(ESIM, self).__init__()

        self.vocab_size = vocab_size
#         self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout = dropout
        self.device = device
        # Debug output: echo the hyper-parameters the model was built with.
        print("vocab_size:",vocab_size)
        print("embedding_dim:",embedding_dim)
        print("hidden_size:",hidden_size)
        print("num_classes:",num_classes)
        print("dropout:",dropout)
        print("device:",device)

        self._word_embedding = nn.Embedding(self.vocab_size,
                                            self.embedding_dim,
                                            padding_idx=padding_idx,
                                            _weight=embeddings)

        # _rnn_dropout only exists when dropout is non-zero; every use in
        # forward() is guarded by the same `if self.dropout` check.
        if self.dropout:
            self._rnn_dropout = RNNDropout(p=self.dropout)
            # self._rnn_dropout = nn.Dropout(p=self.dropout)

        self._encoding = Seq2SeqEncoder(nn.LSTM,
                                        self.embedding_dim,
                                        self.hidden_size,
                                        bidirectional=True)

        # _attention produces the attended vectors used by the model;
        # _interattention is only used to print attention weights in forward().
        self._attention = SoftmaxAttention()
        self._interattention = InterAttention()
#         self._intraattention = IntraAttention(self.batch_size, self.embedding_dim)

        # Input width 4*2*hidden_size: forward() concatenates four tensors
        # (encoded, attended, difference, product) on the last axis.
        # Assumes each has width 2*hidden_size (bidirectional encoder) -- TODO confirm.
        self._projection = nn.Sequential(nn.Linear(4*2*self.hidden_size,
                                                   self.hidden_size),
                                         nn.ReLU())

        self._composition = Seq2SeqEncoder(nn.LSTM,
                                           self.hidden_size,
                                           self.hidden_size,
                                           bidirectional=True)

        # Input width 2*4*hidden_size: forward() concatenates the average and
        # max poolings of both composed sequences.
        self._classification = nn.Sequential(nn.Dropout(p=self.dropout),
                                             nn.Linear(2*4*self.hidden_size,
                                                       self.hidden_size),
                                             nn.Tanh(),
                                             nn.Dropout(p=self.dropout),
                                             nn.Linear(self.hidden_size,
                                                       self.num_classes))

        # Initialize all weights and biases in the model.
        self.apply(_init_esim_weights)

    def forward(self,
                premises,
                premises_lengths,
                hypotheses,
                hypotheses_lengths):
        """
        Args:
            premises: A batch of variable length sequences of word indices
                representing premises. The batch is assumed to be of size
                (batch, premises_length).
            premises_lengths: A 1D tensor containing the lengths of the
                premises in 'premises'.
            hypotheses: A batch of variable length sequences of word indices
                representing hypotheses. The batch is assumed to be of size
                (batch, hypotheses_length).
            hypotheses_lengths: A 1D tensor containing the lengths of the
                hypotheses in 'hypotheses'.

        Returns:
            logits: A tensor of size (batch, num_classes) containing the
                logits for each output class of the model.
            probabilities: A tensor of size (batch, num_classes) containing
                the probabilities of each output class in the model.

        Side effects:
            Prints the inputs and several intermediate tensors, together with
            non-zero counts computed by get_non_zeros_3D / get_non_zeros_2D.
            These diagnostics do not influence the returned values.
        """
        print("premises:",premises,premises.size(),premises[0])
        print("premises_lengths:",premises_lengths)
        premises_mask = get_mask(premises, premises_lengths).to(self.device)
        print("premises_mask:",premises_mask,premises_mask.size(),premises_mask[0],sum(premises_mask[0]))
        hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device)

        embedded_premises = self._word_embedding(premises)
        # Debug only: non-zero counts of the raw premise embeddings.
        print("embedded_premises:",embedded_premises,embedded_premises.size())
        embedded_premises_3D=get_non_zeros_3D(embedded_premises)
        print("embedded_premises_3D:",embedded_premises_3D)
        embedded_premises_2D=get_non_zeros_2D(embedded_premises_3D)
        print("embedded_premises_2D:",embedded_premises_2D)
        embedded_hypotheses = self._word_embedding(hypotheses)

        if self.dropout:
            embedded_premises = self._rnn_dropout(embedded_premises)
            embedded_hypotheses = self._rnn_dropout(embedded_hypotheses)

        # Debug only: the same counts after the (optional) dropout.
        embedded_premises_dropout_3D=get_non_zeros_3D(embedded_premises)
        print("embedded_premises_dropout_3D:",embedded_premises_dropout_3D)
        embedded_premises_dropout_2D=get_non_zeros_2D(embedded_premises_dropout_3D)
        print("embedded_premises_dropout_2D:",embedded_premises_dropout_2D)
        encoded_premises = self._encoding(embedded_premises,
                                          premises_lengths)
        # Debug only: non-zero counts of the encoded premises.
        print("encoded_premises:",encoded_premises,encoded_premises.size())
        encoded_premises_3D=get_non_zeros_3D(encoded_premises)
        print("encoded_premises_3D:",encoded_premises_3D)
        encoded_premises_2D=get_non_zeros_2D(encoded_premises_3D)
        print("encoded_premises_2D:",encoded_premises_2D)
        encoded_hypotheses = self._encoding(embedded_hypotheses,
                                            hypotheses_lengths)
        print("encoded_hypotheses:",encoded_hypotheses,encoded_hypotheses.size())

        # Debug only: these attention weights are printed and then discarded.
        prem_hyp_attn, hyp_prem_attn = self._interattention(encoded_premises, premises_mask,
                            encoded_hypotheses, hypotheses_mask)
        print("prem_hyp_attn:",prem_hyp_attn,prem_hyp_attn.size())
        print("hyp_prem_attn:",hyp_prem_attn,hyp_prem_attn.size())
#         beta_premises=self._intraattention(encoded_premises)
#         beta_hypotheses=self._intraattention(encoded_hypotheses)
        # The attended vectors that feed the rest of the network come from
        # the SoftmaxAttention layer.
        attended_premises, attended_hypotheses =\
            self._attention(encoded_premises, premises_mask,
                            encoded_hypotheses, hypotheses_mask)
        print("attended_premises:",attended_premises,attended_premises.size())

        # Enhancement: concatenate the encoding, its attended counterpart,
        # their difference and their element-wise product.
        enhanced_premises = torch.cat([encoded_premises,
                                       attended_premises,
                                       encoded_premises - attended_premises,
                                       encoded_premises * attended_premises],
                                      dim=-1)
        enhanced_hypotheses = torch.cat([encoded_hypotheses,
                                         attended_hypotheses,
                                         encoded_hypotheses - attended_hypotheses,
                                         encoded_hypotheses * attended_hypotheses],
                                        dim=-1)

        projected_premises = self._projection(enhanced_premises)
        projected_hypotheses = self._projection(enhanced_hypotheses)

        if self.dropout:
            projected_premises = self._rnn_dropout(projected_premises)
            projected_hypotheses = self._rnn_dropout(projected_hypotheses)

        v_ai = self._composition(projected_premises, premises_lengths)
        v_bj = self._composition(projected_hypotheses, hypotheses_lengths)

        # Masked average over the sequence axis: the mask is reshaped from
        # (batch, length) to (batch, length, 1) so it broadcasts over the
        # feature axis, and the sum is divided by the mask total per example.
        v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1)
                                                .transpose(2, 1), dim=1)\
            / torch.sum(premises_mask, dim=1, keepdim=True)
        v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1)
                                                  .transpose(2, 1), dim=1)\
            / torch.sum(hypotheses_mask, dim=1, keepdim=True)

        # Max over the sequence axis; presumably replace_masked writes -1e7
        # into masked positions so they cannot win the max -- TODO confirm.
        v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1)
        v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1)

        v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1)

        logits = self._classification(v)
        probabilities = nn.functional.softmax(logits, dim=-1)

        return logits, probabilities


def _init_esim_weights(module):
    """
    Initialise the weights of the ESIM model.
    """
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight.data)
        nn.init.constant_(module.bias.data, 0.0)

    elif isinstance(module, nn.LSTM):
        nn.init.xavier_uniform_(module.weight_ih_l0.data)
        nn.init.orthogonal_(module.weight_hh_l0.data)
        nn.init.constant_(module.bias_ih_l0.data, 0.0)
        nn.init.constant_(module.bias_hh_l0.data, 0.0)
        hidden_size = module.bias_hh_l0.data.shape[0] // 4
        module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0

        if (module.bidirectional):
            nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data)
            nn.init.orthogonal_(module.weight_hh_l0_reverse.data)
            nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0)
            nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0)
            module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0

In [143]:
def test(model, dataloader, max_batches=1):
    """
    Test the accuracy of a model on some labelled test dataset.

    Args:
        model: The torch module on which testing must be performed. It must
            expose a `device` attribute.
        dataloader: A DataLoader object to iterate over some dataset. Each
            batch is indexed with the keys "premise", "premise_length",
            "hypothesis", "hypothesis_length" and "label".
        max_batches: The maximum number of batches to evaluate. Defaults to
            1, which matches the notebook's previous behaviour of stopping
            after the first batch; pass None to evaluate the whole
            dataloader.

    Returns:
        batch_time: The average time to predict the classes of a batch.
        total_time: The total time spent in the evaluation loop.
        accuracy: The accumulated result of `correct_predictions` divided by
            the number of examples evaluated (0.0 if nothing was evaluated).
    """
    # islice stops after `max_batches` items without fetching another batch,
    # so no extra data is loaded compared with an explicit `break`.
    from itertools import islice

    # Switch the model to eval mode.
    model.eval()
    device = model.device

    time_start = time.time()
    batch_time = 0.0
    correct = 0.0
    num_batches = 0
    num_examples = 0

    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for batch in islice(dataloader, max_batches):
            batch_start = time.time()

            # Move input and output data to the GPU if one is used.
            premises = batch["premise"].to(device)
            premises_lengths = batch["premise_length"].to(device)
            hypotheses = batch["hypothesis"].to(device)
            hypotheses_lengths = batch["hypothesis_length"].to(device)
            labels = batch["label"].to(device)

            _, probs = model(premises,
                             premises_lengths,
                             hypotheses,
                             hypotheses_lengths)

            # NOTE(review): assumes correct_predictions returns the number of
            # correct predictions in the batch -- confirm in esim.utils.
            correct += correct_predictions(probs, labels)
            batch_time += time.time() - batch_start
            num_batches += 1
            num_examples += labels.size(0)

    total_time = time.time() - time_start

    # Nothing evaluated (empty dataloader or max_batches == 0): avoid 0/0.
    if num_batches == 0 or num_examples == 0:
        return 0.0, total_time, 0.0

    return batch_time / num_batches, total_time, correct / num_examples

In [144]:
def main(test_file, pretrained_file, batch_size=32):
    """
    Test the ESIM model with pretrained weights on some dataset.

    The model hyper-parameters (vocabulary size, embedding dimension, hidden
    size and number of classes) are not arguments: they are read from the
    shapes of the tensors stored in the checkpoint.

    Args:
        test_file: The path to a file containing preprocessed NLI data.
        pretrained_file: The path to a checkpoint produced by the
            'train_model' script. Its "model" entry must be a state dict
            loadable by ESIM.
        batch_size: The size of the batches used for testing. Defaults to 32.
    """
    print(20 * "=", " Preparing for testing ", 20 * "=")

    # Use the GPU when available. map_location remaps the checkpoint's
    # tensors onto the selected device, so a checkpoint saved on a GPU also
    # loads on a CPU-only machine without editing this cell.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(pretrained_file, map_location=device)

    # Retrieving model parameters from checkpoint.
    state_dict = checkpoint["model"]
    vocab_size = state_dict["_word_embedding.weight"].size(0)
    embedding_dim = state_dict['_word_embedding.weight'].size(1)
    hidden_size = state_dict["_projection.0.weight"].size(0)
    num_classes = state_dict["_classification.4.weight"].size(0)

    print("\t* Loading test data...")
    # SECURITY: pickle.load can execute arbitrary code; only pass files
    # produced by a trusted preprocessing run.
    with open(test_file, "rb") as pkl:
        test_data = NLIDataset(pickle.load(pkl))

    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    print("\t* Building model...")
    model = ESIM(vocab_size,
                 embedding_dim,
                 hidden_size,
                 num_classes=num_classes,
                 device=device).to(device)

    model.load_state_dict(state_dict)

    print(20 * "=",
          " Testing ESIM model on device: {} ".format(device),
          20 * "=")
    test(model, test_loader)

In [145]:
# Driver cell: evaluate the saved ESIM checkpoint on the preprocessed Quora
# test split. Both paths are absolute and machine-specific; adjust them
# before running this notebook elsewhere.
test_data = "/home/rongz/paraphrase_recognition/src/data/preprocessed/quora/test_data.pkl"
checkpoint = "/home/rongz/paraphrase_recognition/exp/saved_model/ESIM/qqp/best.pth.tar"
batch_size = 32

main(test_file=test_data,
     pretrained_file=checkpoint,
     batch_size=batch_size)

	* Loading test data...
	* Building model...
vocab_size: 128447
embedding_dim: 300
hidden_size: 300
num_classes: 2
dropout: 0.5
device: cuda:0
premises: tensor([[  2,   6,  43,  ...,   0,   0,   0],
        [  2,  76, 206,  ...,   0,   0,   0],
        [  2,  11,  20,  ...,   0,   0,   0],
        ...,
        [  2,  11, 162,  ...,   0,   0,   0],
        [  2,   6,   8,  ...,   0,   0,   0],
        [  2,   6,   8,  ...,   0,   0,   0]], device='cuda:0') torch.Size([32, 71]) tensor([   2,    6,   43,    7,   14,   10,  689, 1534,   12,  394,    4,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       device='cuda:0')
premises_lengths: te

  sequences_lengths.new_tensor(torch.arange(0, len(sequences_lengths)))


encoded_premises_3D: [[600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 600, 0, 0, 0, 0, 0, 0, 0, 0,

In [20]:
# Scratch check of broadcasting: a has size (3, 2, 2) and b has size (3, 2).
# Values are random (no seed is set), so the outputs below change on re-run.
a = torch.rand(3,2,2)
b = torch.rand(3,2)

In [21]:
# Append a singleton axis: c has size (3, 2, 1) so it can broadcast against a.
c = b.unsqueeze(-1)

In [25]:
# c broadcasts over the last axis of a: every a[i, j, :] is scaled by b[i, j].
a * c

tensor([[[0.0071, 0.0527],
         [0.2866, 0.1333]],

        [[0.4576, 0.5175],
         [0.0553, 0.0382]],

        [[0.3357, 0.4629],
         [0.1772, 0.4650]]])

In [23]:
a

tensor([[[0.0670, 0.4984],
         [0.9579, 0.4455]],

        [[0.8613, 0.9739],
         [0.7200, 0.4967]],

        [[0.5814, 0.8015],
         [0.2819, 0.7397]]])

In [24]:
b

tensor([[0.1057, 0.2992],
        [0.5314, 0.0768],
        [0.5775, 0.6286]])