# Learning sentence representations from NLI data


In [146]:
from  torchtext.legacy import data
import torch
from torchtext.legacy.datasets import SNLI
from torchtext.vocab import GloVe
from models import NLITrainer
import utils
from scipy.spatial import distance
from nltk.tokenize import word_tokenize

# Root directory passed to utils.load_latest when restoring the trained models below.
CHECKPOINT_PATH = "./checkpoints"

## 1 Preparations
### 1.1 Load SNLI Data

In [147]:
# Field used for both premise and hypothesis text. `include_lengths=True`
# makes TEXT.process return the padded batch together with the lengths.
TEXT = data.Field(
    lower=True,
    include_lengths=True,
    batch_first=True,
    tokenize="spacy",
    tokenizer_language="en_core_web_sm",
)
# Labels are single categorical values, not token sequences.
LABEL = data.Field(sequential=False)

# SNLI train / validation / test splits.
train, val, test = SNLI.splits(TEXT, LABEL)

# One iterator per split, kept on the CPU.
train_iter, val_iter, test_iter = data.Iterator.splits(
    (train, val, test),
    batch_size=4,
    device="cpu",
)

# Vocabularies are built from the training split only; the text vocabulary
# is paired with 300-dimensional GloVe 840B vectors.
TEXT.build_vocab(train, vectors=GloVe(name="840B", dim=300))
LABEL.build_vocab(train, specials_first=False)

In [148]:
# Maps the classifier's argmax index to a human-readable SNLI label.
# Fix: class 1 is "contradiction" (it was misspelled "contraction").
# NOTE(review): the index order is assumed to match LABEL.vocab.itos as built
# with specials_first=False -- confirm against LABEL.vocab.itos.
label_dict = {0: "entailment", 1: "contradiction", 2: "neutral"}
# Shown as the cell output: number of examples in the validation split.
len(val.examples)

9842

### 1.2 Load Pretrained Model

In [149]:
def _load_trainer(encoder_name):
    """Call utils.load_latest for `encoder_name` with the settings shared by all four models."""
    return utils.load_latest(
        NLITrainer,
        CHECKPOINT_PATH,
        encoder_name,
        inference=True,
        map_location="cpu",
        silent=False,
    )


# Loaded in the same order as before so the log output keeps its order.
BLSTM_Max_trainer = _load_trainer("BLSTM_Encoder_Max")
BLSTM_trainer = _load_trainer("BLSTM_Encoder")
LSTM_trainer = _load_trainer("LSTM_Encoder")
AWE_trainer = _load_trainer("AWE")

pretrained_filename:  ./checkpoints/BLSTM_Encoder_Max/lightning_logs/version_7594760/checkpoints/epoch=27-step=120175.ckpt
Found pretrained model at ./checkpoints/BLSTM_Encoder_Max/lightning_logs/version_7594760/checkpoints/epoch=27-step=120175.ckpt
pretrained_filename:  ./checkpoints/BLSTM_Encoder/lightning_logs/version_7596008/checkpoints/epoch=27-step=120175.ckpt
Found pretrained model at ./checkpoints/BLSTM_Encoder/lightning_logs/version_7596008/checkpoints/epoch=27-step=120175.ckpt
pretrained_filename:  ./checkpoints/LSTM_Encoder/lightning_logs/version_7594756/checkpoints/epoch=17-step=154511.ckpt
Found pretrained model at ./checkpoints/LSTM_Encoder/lightning_logs/version_7594756/checkpoints/epoch=17-step=154511.ckpt
pretrained_filename:  ./checkpoints/AWE/lightning_logs/version_7592794/checkpoints/epoch=16-step=145927.ckpt
Found pretrained model at ./checkpoints/AWE/lightning_logs/version_7592794/checkpoints/epoch=16-step=145927.ckpt


In [150]:
def predict(trainer, hypo, prem):
    """Return the label name `trainer` predicts for one premise/hypothesis pair.

    Args:
        trainer: callable model; it is invoked as ``trainer([prem, hypo])``.
        hypo: processed hypothesis (in this notebook, the output of TEXT.process).
        prem: processed premise (same format as `hypo`).

    Returns:
        The entry of `label_dict` for the argmax over the last dimension of
        the model output.

    Note:
        ``.item()`` requires a single-element prediction, so only one sentence
        pair can be passed at a time.
    """
    # Inference only: skip autograd bookkeeping, consistent with `encode`.
    with torch.no_grad():
        out = trainer([prem, hypo])
    pred_idx = out.argmax(dim=-1).item()
    return label_dict[pred_idx]

In [151]:
def encode(trainer, text):
    """Return `trainer.encode(text)` as a detached CPU tensor, computed without gradients.

    Args:
        trainer: object exposing an ``encode`` method.
        text: input forwarded unchanged to ``trainer.encode`` (in this
            notebook, the output of TEXT.process).
    """
    with torch.no_grad():
        embedding = trainer.encode(text)
    return embedding.detach().cpu()

zsh:1: command not found: tensorboard


## 2 Error Analysis
### 2.1 Examples in SNLI

In [152]:
# Validation example on which only BLSTM_Max predicts the gold label.
i = 5000
example = val.examples[i]
for shown in (example.hypothesis, example.premise, example.label):
    print(shown)

# `hypo` and `prem` are reused by the later perturbation cells.
hypo = TEXT.process([example.hypothesis])
prem = TEXT.process([example.premise])
true_label = LABEL.process([example.label])

for tag, model in (
    ("AWE:", AWE_trainer),
    ("LSTM:", LSTM_trainer),
    ("BLSTM:", BLSTM_trainer),
    ("BLSTM_Max:", BLSTM_Max_trainer),
):
    print(tag, predict(model, hypo, prem))

['a', 'boy', 'enjoys', 'himself', 'at', 'the', 'pool', '.']
['a', 'boy', 'in', 'an', 'innertube', 'in', 'the', 'pool', 'splashing', 'and', 'smiling', '.']
entailment
AWE: contraction
LSTM: neutral
BLSTM: neutral
BLSTM_Max: entailment


### 2.2 Switching word orders
#### 2.2.1 Without meaning changes

In [153]:
# 
# Two premises that differ only in the order of the coordinated verbs.
premise_1 = word_tokenize('a boy at the pool smiling and splashing.')
premise_2 = word_tokenize('a boy at the pool splashing and smiling.')

prem_1 = TEXT.process([premise_1])
prem_2 = TEXT.process([premise_2])

for reordered in (prem_1, prem_2):
    print("BLSTM:", predict(BLSTM_trainer, hypo, reordered))
print()

# Embeddings of both variants and of the original SNLI premise (`prem`).
b_emb_1, b_emb_2, b_emb_0 = (
    encode(BLSTM_trainer, sent) for sent in (prem_1, prem_2, prem)
)
bm_emb_1, bm_emb_2, bm_emb_0 = (
    encode(BLSTM_Max_trainer, sent) for sent in (prem_1, prem_2, prem)
)

# For each variant, show a small slice of the embedding and its largest value.
print("BLSTM")
for emb in (b_emb_1, b_emb_2):
    print(emb[:, 10:16], emb.max())

print("\nBLSTM_max")
for emb in (bm_emb_1, bm_emb_2):
    print(emb[:, 10:16], emb.max())

print("\nLSTM")
l_emb_1, l_emb_2 = (encode(LSTM_trainer, sent) for sent in (prem_1, prem_2))
for emb in (l_emb_1, l_emb_2):
    print(emb[:, 10:16], emb.max())

print("\nWhen meaning doesn't change, the embedding of BLSTM_max remains almost the same but BLSTM changes much")

BLSTM: neutral
BLSTM: entailment

BLSTM
tensor([[ 0.0183, -0.1460, -0.0072, -0.0148, -0.0192,  0.0217]]) tensor(0.6207)
tensor([[ 0.0208, -0.1000, -0.0021,  0.0066, -0.0103,  0.0046]]) tensor(0.6504)

BLSTM_max
tensor([[ 0.0703, -0.0250,  0.0285,  0.0112,  0.0003, -0.0023]]) tensor(0.3383)
tensor([[ 0.0703, -0.0250,  0.0285,  0.0112, -0.0040, -0.0023]]) tensor(0.3387)

LSTM
tensor([[ 0.0195,  0.0047, -0.0966, -0.0147, -0.0689,  0.0874]]) tensor(0.6014)
tensor([[-0.0114,  0.0068, -0.0945, -0.0634, -0.0743,  0.0779]]) tensor(0.6138)

When meaning doesn't change, the embedding of BLSTM_max remains almost the same but BLSTM changes much


#### 2.2.2 With meaning changes

In [176]:
# Premise plus two hypotheses: one swaps agent and patient, one keeps the order.
prem_tokens = word_tokenize('a man pushes a lady to the ground.')
swapped_tokens = word_tokenize('a lady pushes a man')
same_order_tokens = word_tokenize('a man pushes a lady')

hypo_new = TEXT.process([swapped_tokens])
prem_new = TEXT.process([prem_tokens])
hypo_new1 = TEXT.process([same_order_tokens])

for tag, model in (
    ("AWE:", AWE_trainer),
    ("LSTM:", LSTM_trainer),
    ("BLSTM:", BLSTM_trainer),
    ("BLSTM_Max:", BLSTM_Max_trainer),
):
    print(tag, predict(model, hypo_new, prem_new))

# For every encoder, compare the embeddings of the two hypotheses.
print("\nBLSTM_Max:")
bm_emb_n, bm_emb_n1 = (encode(BLSTM_Max_trainer, h) for h in (hypo_new, hypo_new1))
for emb in (bm_emb_n, bm_emb_n1):
    print(emb[:, 10:16])
print("When meaning changes, embeddings are different")

print("\nBLSTM:")
b_emb_n, b_emb_n1 = (encode(BLSTM_trainer, h) for h in (hypo_new, hypo_new1))
for emb in (b_emb_n, b_emb_n1):
    print(emb[:, 10:16])

print("\nLSTM:")
l_emb_n, l_emb_n1 = (encode(LSTM_trainer, h) for h in (hypo_new, hypo_new1))
for emb in (l_emb_n, l_emb_n1):
    print(emb[:, 10:16])

print("\nAWE:")
a_emb_n, a_emb_n1 = (encode(AWE_trainer, h) for h in (hypo_new, hypo_new1))
for emb in (a_emb_n, a_emb_n1):
    print(emb[:, 10:16])

AWE: entailment
LSTM: entailment
BLSTM: entailment
BLSTM_Max: contraction

BLSTM_Max:
tensor([[ 0.0814, -0.0357,  0.0422,  0.0062,  0.0754, -0.0055]])
tensor([[ 0.1552, -0.0426,  0.0323,  0.0208,  0.0332,  0.0023]])
When meaning changes, embeddings are different

BLSTM:
tensor([[ 0.0441, -0.0313,  0.0261,  0.0487, -0.0014,  0.0725]])
tensor([[ 0.0071,  0.0512, -0.0310,  0.0464,  0.0748,  0.0174]])

LSTM:
tensor([[ 0.0226,  0.0263,  0.0042, -0.1209,  0.0206,  0.0431]])
tensor([[-0.0271,  0.0985, -0.0074, -0.1414,  0.0529,  0.0283]])

AWE:
tensor([[-0.1496, -0.0945, -0.1279, -0.0653, -0.1515,  0.0153]])
tensor([[-0.1496, -0.0945, -0.1279, -0.0653, -0.1515,  0.0153]])


### 2.3 Changing non-content words

In [163]:
# Premises that differ only in the preposition ("at" vs "in").
premise_3 = word_tokenize('a boy at the pool smiling.')
premise_4 = word_tokenize('a boy in the pool smiling.')
prem_3, prem_4 = (TEXT.process([tokens]) for tokens in (premise_3, premise_4))

# Each model is queried with both premises against the current `hypo`.
for tag, model in (
    ("AWE:", AWE_trainer),
    ("LSTM:", LSTM_trainer),
    ("BLSTM:", BLSTM_trainer),
):
    for variant in (prem_3, prem_4):
        print(tag, predict(model, hypo, variant))

#  AWE is sensitive to preposition&word change
awe_emb_3, awe_emb_4 = (encode(AWE_trainer, variant) for variant in (prem_3, prem_4))


AWE: entailment
AWE: contraction
LSTM: entailment
LSTM: neutral
BLSTM: neutral
BLSTM: neutral


In [161]:
# Third premise variant: "outside" replaces "at" / "in".
premise_5 = word_tokenize('a boy outside the pool smiling.')

prem_5 = TEXT.process([premise_5])

print("AWE:", predict(AWE_trainer, hypo, prem_5))
print("LSTM:", predict(LSTM_trainer, hypo, prem_5))
print("BLSTM:", predict(BLSTM_trainer, hypo, prem_5))
print("BLSTM_Max:", predict(BLSTM_Max_trainer, hypo, prem_5))


# BLSTM_Max/classifier is not sensitive to preposition change
# All three embeddings are computed here so the cell runs on a fresh kernel:
# bm_emb_3 / bm_emb_4 were previously never defined, and bm_emb_5 was
# computed from prem_4 instead of prem_5.
bm_emb_3 = encode(BLSTM_Max_trainer, prem_3)
bm_emb_4 = encode(BLSTM_Max_trainer, prem_4)
bm_emb_5 = encode(BLSTM_Max_trainer, prem_5)

print(bm_emb_3[:, :5])
print(bm_emb_4[:, :5])
print(bm_emb_5[:, :5])

AWE: contraction
LSTM: neutral
BLSTM: neutral
BLSTM_Max: entailment
tensor([[ 0.0100,  0.0118, -0.0108,  0.0758,  0.0107]])
tensor([[ 0.0059,  0.0118, -0.0350,  0.0996,  0.0418]])
tensor([[ 0.0100,  0.0118, -0.0141,  0.0884,  0.0429]])


### 2.4 Tense change

In [216]:
# Same sentence in present tense (premise) and past tense (hypothesis).
prem_new = word_tokenize('the girl is in a park')
hypo_new = word_tokenize('the girl was in a park ')

hypo_new, prem_new = TEXT.process([hypo_new]), TEXT.process([prem_new])
# The stale `hypo_new1 = TEXT.process([hypo_new1])` line was removed: it
# re-processed a tensor tuple left over from an earlier cell, and the result
# was not used in this cell.

print("AWE:", predict(AWE_trainer, hypo_new, prem_new))
print("LSTM:", predict(LSTM_trainer, hypo_new, prem_new))
print("BLSTM:", predict(BLSTM_trainer, hypo_new, prem_new))
print("BLSTM_Max:", predict(BLSTM_Max_trainer, hypo_new, prem_new))

# In this cell the `*_n` embedding is the hypothesis and `*_n1` is the premise.
print("\nBLSTM_Max:")
bm_emb_n = encode(BLSTM_Max_trainer, hypo_new)
bm_emb_n1 = encode(BLSTM_Max_trainer, prem_new)
print(bm_emb_n[:, 10:16])
print(bm_emb_n1[:, 10:16])

print("\nAWE:")
a_emb_n = encode(AWE_trainer, hypo_new)
a_emb_n1 = encode(AWE_trainer, prem_new)
print(a_emb_n[:, 10:16])
print(a_emb_n1[:, 10:16])

AWE: neutral
LSTM: entailment
BLSTM: entailment
BLSTM_Max: entailment

BLSTM_Max:
tensor([[-0.0273, -0.0025,  0.0203,  0.0073,  0.0148,  0.0065]])
tensor([[-0.0222,  0.0115,  0.0203,  0.0014,  0.0124,  0.0147]])
When meaning changes, embeddings are different

AWE:
tensor([[-0.3294, -0.0216,  0.0055, -0.0576, -0.1400,  0.0085]])
tensor([[-0.3646, -0.0683, -0.0871, -0.0401, -0.1054,  0.0184]])


In [231]:
# Same sentence with a present-tense (premise) and past-tense (hypothesis) verb.
prem_new = word_tokenize('the girl plays in a park')
hypo_new = word_tokenize('the girl played in a park ')

hypo_new, prem_new = TEXT.process([hypo_new]), TEXT.process([prem_new])
# The stale `hypo_new1 = TEXT.process([hypo_new1])` line was removed: it
# re-processed a tensor tuple left over from an earlier cell, and the result
# was not used in this cell.

print("AWE:", predict(AWE_trainer, hypo_new, prem_new))
print("LSTM:", predict(LSTM_trainer, hypo_new, prem_new))
print("BLSTM:", predict(BLSTM_trainer, hypo_new, prem_new))
print("BLSTM_Max:", predict(BLSTM_Max_trainer, hypo_new, prem_new))

# In this cell the `*_n` embedding is the hypothesis and `*_n1` is the premise.
print("\nBLSTM_Max:")
bm_emb_n = encode(BLSTM_Max_trainer, hypo_new)
bm_emb_n1 = encode(BLSTM_Max_trainer, prem_new)
print(bm_emb_n[:, 10:16])
print(bm_emb_n1[:, 10:16])

print("\nAWE:")
a_emb_n = encode(AWE_trainer, hypo_new)
a_emb_n1 = encode(AWE_trainer, prem_new)
print(a_emb_n[:, 10:16])
print(a_emb_n1[:, 10:16])

AWE: entailment
LSTM: entailment
BLSTM: entailment
BLSTM_Max: entailment

BLSTM_Max:
tensor([[-0.0495,  0.0070,  0.0203,  0.0018,  0.0700,  0.0789]])
tensor([[-0.0530,  0.0347,  0.0203,  0.0023,  0.0055,  0.0651]])

AWE:
tensor([[-0.2802, -0.0079, -0.0012, -0.0887, -0.0896,  0.0890]])
tensor([[-0.2994,  0.0042, -0.0472, -0.0425, -0.0973,  0.1327]])


### 2.5 Object changes

In [225]:
# Validation example with gold label "neutral"; in the recorded output only
# AWE matches it, while the three LSTM-based models predict entailment.
i = 1000
example = val.examples[i]
for shown in (example.hypothesis, example.premise, example.label):
    print(shown)

hypo = TEXT.process([example.hypothesis])
prem = TEXT.process([example.premise])
true_label = LABEL.process([example.label])

for tag, model in (
    ("AWE:", AWE_trainer),
    ("LSTM:", LSTM_trainer),
    ("BLSTM:", BLSTM_trainer),
    ("BLSTM_Max:", BLSTM_Max_trainer),
):
    print(tag, predict(model, hypo, prem))

['two', 'people', 'looking', 'at', 'new', 'york', '.']
['two', 'people', 'are', 'looking', 'at', 'something', 'in', 'new', 'york', 'city', '.']
neutral
AWE: neutral
LSTM: entailment
BLSTM: entailment
BLSTM_Max: entailment


In [226]:
# Compare the BLSTM_Max embeddings of the current hypothesis and premise.
print("\nBLSTM_Max:")
bm_emb_n, bm_emb_n1 = (encode(BLSTM_Max_trainer, sent) for sent in (hypo, prem))
for emb in (bm_emb_n, bm_emb_n1):
    print(emb[:, 10:16])
print("When meaning changes, embeddings are similar")


BLSTM_Max:
tensor([[ 0.1048,  0.0470,  0.0035,  0.0246, -0.1057,  0.2409]])
tensor([[ 0.1048,  0.0470,  0.0035,  0.0327, -0.0938,  0.2474]])
When meaning changes, embeddings are similar


## Summary:
1. BLSTM_Max Encoder outperforms the other encoders in most cases. By using max pooling, it captures the most salient features of a sentence. This enables more robust sentence representations regardless of minor changes in the sentence (e.g. replacing words, changing word order). However, this can also make the classifier neglect meaning changes caused by such edits.


2. The AWE Encoder classifier is very sensitive to word changes and additional information, so it only succeeds on SNLI examples where the two sentences share many similar words. However, this sensitivity enables it to perform better than the BLSTM_Max Encoder classifier in some cases. When the word order changes, the AWE embeddings remain the same.

3. LSTM Encoder and BLSTM Encoder have very similar performance over all tasks. They perform better than the AWE encoder model but worse than the BLSTM_Max one. The encoders capture information changes when modifications are made to the sentences, but this doesn't enable the classifiers to make good predictions in some cases.

