### Preprocess e-SNLI data from .csv files into batches of tensors ready to feed into models

In [1]:
import torch
import os
import sys
sys.path.append('./../../src')
import random
import numpy as np

In [2]:
# Seed every random number generator used by the notebook so that runs are
# reproducible.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
# random.seed() without an argument seeds from OS entropy / the current time,
# which defeats reproducibility; use the same fixed seed as the other RNGs.
random.seed(0)
np.random.seed(0)

In [3]:
# Input data and output locations used by the rest of the notebook.
train_data_path = '/data/rosa/data/esnli/esnli_train.csv'  # training CSV (absolute, machine-specific path)
dev_data_path = './sanity-checks/esnli_dev.csv'  # dev CSV passed to processor.get_dev_examples
cached_train_features_file = './cache/cached_train_esnli'  # cache file for the tokenized training features
save_trained_model_dir = './esnli_train_trained_model/'  # directory the fine-tuned model is loaded from for evaluation

### Get Examples 

In [4]:
from transformers import DataProcessor, InputExample
import csv

In [5]:
class EsnliProcessor(DataProcessor):
    """Read e-SNLI CSV files into ``InputExample`` objects.

    Every example carries ``text_a = "<premise> [SEP] <hypothesis>"``, the
    first explanation as ``text_b`` and the label string as ``label``.
    Dev examples additionally carry the second and third explanations as
    ``text_c`` and ``text_d``.
    """

    # Column positions inside a CSV row.
    _LABEL_COL = 1
    _PREMISE_COL = 2
    _HYPOTHESIS_COL = 3
    _EXPL_1_COL = 4
    _EXPL_2_COL = 9
    _EXPL_3_COL = 14

    def _read_examples(self, data_path, set_type, extra_explanations=False):
        """Build one ``InputExample`` per data row of the CSV at ``data_path``.

        Args:
            data_path: path of the CSV file; its first row is a header and is skipped.
            set_type: prefix for the example guid (``"<set_type>-<row number>"``,
                where the first data row is number 1).
            extra_explanations: when True, also read explanations 2 and 3
                and pass them to ``InputExample`` as ``text_c`` / ``text_d``.

        Returns:
            list of ``InputExample``.

        Raises:
            IndexError: if a row has fewer columns than required.
        """
        examples = []
        # Explicit encoding so decoding does not depend on the platform locale.
        # NOTE(review): assumes the e-SNLI CSVs are UTF-8 -- confirm.
        with open(data_path, newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row (no-op on an empty file)
            for i, row in enumerate(reader, start=1):
                # csv.reader yields every field as str, so no type checks are needed.
                fields = {
                    'guid': "%s-%s" % (set_type, i),
                    # p + [SEP] + h
                    'text_a': row[self._PREMISE_COL] + " [SEP] " + row[self._HYPOTHESIS_COL],
                    'text_b': row[self._EXPL_1_COL],  # expl 1
                    'label': row[self._LABEL_COL],
                }
                if extra_explanations:
                    fields['text_c'] = row[self._EXPL_2_COL]  # expl 2
                    fields['text_d'] = row[self._EXPL_3_COL]  # expl 3
                examples.append(InputExample(**fields))
        return examples

    def get_train_examples(self, data_path):
        """Return training examples (one explanation per row) read from ``data_path``."""
        return self._read_examples(data_path, "train")

    def get_dev_examples(self, data_path):
        """Return dev examples (three explanations per row) read from ``data_path``."""
        return self._read_examples(data_path, "dev", extra_explanations=True)

In [6]:
# Reader that turns e-SNLI CSV rows into InputExample objects.
processor = EsnliProcessor()

In [7]:
# #using a smaller dataset so faster to test my code
# train_examples = processor.get_train_examples(train_data_path) 

In [8]:
# Load the dev split; each example carries three explanations (text_b, text_c, text_d).
dev_examples = processor.get_dev_examples(dev_data_path) 

### Convert Examples to Features

In [9]:
import logging as logger
from transformers import BertTokenizer

In [10]:
# Every sequence is truncated or zero-padded to exactly this many token ids
# (the [CLS] and [SEP] tokens count towards the limit). The attention mask
# marks the padding positions so that no attention is performed on them.
max_seq_len = 128

In [11]:
# Pretrained uncased BERT tokenizer; used to build the input features and,
# further down, to decode token ids and tokenize gold explanations for BLEU.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
def esnli_examples_to_features(examples, max_seq_len, tokenizer, cls_token='[CLS]', sep_token='[SEP]',
                               pad_token=0, mask_padding_with_zero=True):
    """Convert examples into fixed-length ``EsnliInputFeatures``.

    ``text_a`` (premise + " [SEP] " + hypothesis) becomes the encoder input and
    ``text_b`` (explanation 1) the decoder input. When an example has both
    ``text_c`` and ``text_d`` they are encoded as explanations 2 and 3;
    otherwise both are stored as ``None``.

    Does not support token_type_ids, because the EncoderDecoderModel does not.
    Therefore the premise and hypothesis are separated by a [SEP], but there
    are no token_type_ids to tell the two segments apart.

    Args:
        examples: sequence of examples exposing ``text_a`` and ``text_b``
            (and optionally ``text_c`` / ``text_d``).
        max_seq_len: exact length of every produced id / mask list.
        tokenizer: tokenizer handed to ``text_to_input_ids``.
        cls_token, sep_token, pad_token, mask_padding_with_zero: forwarded
            unchanged to ``text_to_input_ids``.

    Returns:
        list of ``EsnliInputFeatures``, one per example, in input order.
    """

    def encode(text):
        # Forward the caller's special-token / padding settings; previously
        # these parameters were accepted but silently ignored.
        return text_to_input_ids(text, max_seq_len, tokenizer,
                                 cls_token=cls_token, sep_token=sep_token,
                                 pad_token=pad_token,
                                 mask_padding_with_zero=mask_padding_with_zero)

    features = []
    num_examples = len(examples)
    for ex_index, example in enumerate(examples):
        if ex_index % 1000 == 0:
            logger.info("Writing example %d of %d", ex_index, num_examples)

        input_ids, input_mask = encode(example.text_a)
        # Only the ids of the explanations are kept; their masks are discarded.
        decoder_input_ids, _ = encode(example.text_b)
        assert len(input_ids) == max_seq_len
        assert len(input_mask) == max_seq_len
        assert len(decoder_input_ids) == max_seq_len

        # Training examples have no extra explanations; tolerate example
        # classes that do not define these attributes at all.
        text_c = getattr(example, 'text_c', None)
        text_d = getattr(example, 'text_d', None)
        expl2_ids = None
        expl3_ids = None
        if text_c is not None and text_d is not None:
            expl2_ids, _ = encode(text_c)
            expl3_ids, _ = encode(text_d)
            assert len(expl2_ids) == max_seq_len
            assert len(expl3_ids) == max_seq_len

        features.append(EsnliInputFeatures(input_ids=input_ids,
                                           attention_mask=input_mask,
                                           decoder_input_ids=decoder_input_ids,
                                           expl2_ids=expl2_ids,
                                           expl3_ids=expl3_ids))
    return features

In [13]:
def text_to_input_ids(text, max_seq_len, tokenizer, cls_token='[CLS]', sep_token='[SEP]',
                      pad_token=0, mask_padding_with_zero=True):
    """Tokenize ``text`` into exactly ``max_seq_len`` token ids plus an attention mask.

    The token sequence is truncated to ``max_seq_len - 2`` tokens, wrapped in
    ``cls_token`` ... ``sep_token``, converted to ids and right-padded with
    ``pad_token``.

    Args:
        text: string to encode.
        max_seq_len: length of both returned lists; must be at least 2 so the
            two special tokens fit.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        cls_token: token placed at the start of the sequence.
        sep_token: token placed at the end of the sequence.
        pad_token: id used for padding positions.
        mask_padding_with_zero: if True the mask is 1 for real tokens and 0
            for padding; if False the values are inverted.

    Returns:
        ``(input_ids, input_mask)``, two lists of length ``max_seq_len``.

    Raises:
        ValueError: if ``max_seq_len`` is smaller than 2.
    """
    if max_seq_len < 2:
        # A smaller value would make the slice bound negative and silently
        # produce sequences of the wrong length.
        raise ValueError(
            "max_seq_len must be at least 2 to fit the CLS and SEP tokens, got %r" % (max_seq_len,))

    # The -2 accounts for cls_token and sep_token that are added around the text.
    tokens = [cls_token] + tokenizer.tokenize(text)[:max_seq_len - 2] + [sep_token]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Only real tokens are attended to; padding positions get the opposite flag.
    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)

    # Pad ids and mask up to the sequence length.
    padding_length = max_seq_len - len(input_ids)
    input_mask = [real_flag] * len(input_ids) + [pad_flag] * padding_length
    input_ids = input_ids + [pad_token] * padding_length

    return input_ids, input_mask

In [14]:
class EsnliInputFeatures:
    """Container for the encoded form of one e-SNLI example.

    Attributes set by the constructor:
        input_ids: encoder input ids.
        attention_mask: mask belonging to ``input_ids``.
        decoder_input_ids: ids of explanation 1.
        labels: the very same object as ``decoder_input_ids`` (explanation 1).
        expl2: ids of explanation 2, as passed in ``expl2_ids``.
        expl3: ids of explanation 3, as passed in ``expl3_ids``.
    """

    def __init__(self, input_ids, attention_mask, decoder_input_ids, expl2_ids, expl3_ids):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        # Explanation 1 serves both as decoder input and as the label sequence.
        self.decoder_input_ids = self.labels = decoder_input_ids
        # Explanations 2 and 3.
        self.expl2, self.expl3 = expl2_ids, expl3_ids

In [15]:
# # Cache training dataset features
# if os.path.exists(cached_train_features_file):
#     logger.info("Loading features from cached file %s", cached_train_features_file)
#     train_features = torch.load(cached_train_features_file)
# else:
#     train_features = esnli_examples_to_features(train_examples, max_seq_len, tokenizer)
#     logger.info("Saving training features into cached file %s", cached_train_features_file)
#     torch.save(train_features, cached_train_features_file)

In [16]:
dev_features = esnli_examples_to_features(dev_examples, max_seq_len, tokenizer)

### Training

In [68]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
%reload_ext autoreload

In [70]:
%autoreload 2

In [71]:
from transformers import EncoderDecoderModel
from transformers import Trainer, TrainingArguments

In [58]:
# #initialize Bert2Bert
# model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') 

In [59]:
# training_args = TrainingArguments(
#     output_dir='./checkpoint-train-results',          # output directory
#     num_train_epochs=3,              # total # of training epochs
#     per_device_train_batch_size=4,  # batch size per device during training
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./train-logs',            # directory for storing logs
#     do_train=True,
#     logging_steps=5000,
#     save_steps=5000,
#     overwrite_output_dir=True,
#     warmup_steps=1000,                # number of warmup steps for learning rate scheduler
# )

# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_features,         # training dataset
#     eval_dataset=train_features            # evaluation dataset
# )

In [60]:
# # Call trainer.train() to train.
# # The first argument returned from forward must be the loss which you wish to optimize.
# trainer.train()

### Save Model After Training

In [61]:
# Directory of the fine-tuned model and the index of the GPU used for evaluation.
output_dir = save_trained_model_dir
cuda_id = "1" # since there's something running on 0

In [62]:
# trainer.save_model(output_dir)

### Evaluation

In [63]:
# Load the fine-tuned encoder-decoder model from output_dir and move it to
# the selected GPU (only the model is loaded here, not a vocabulary).
model = EncoderDecoderModel.from_pretrained(output_dir)
device = torch.device("cuda:"+cuda_id)  # e.g. "cuda:1"
model.to(device)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [64]:
# Generation settings stored on the model config.
model.config.max_length=128  # same value as max_seq_len used for the features
model.config.decoder_start_token_id = 101  # id of [CLS], the first token of every encoded explanation
model.config.eos_token_id = 102  # id of [SEP], the last real token of every encoded explanation

In [65]:
# Evaluation-only arguments: one example per batch, no training options set.
eval_args = TrainingArguments(
    output_dir='./checkpoint-eval-results',          # output directory
    per_device_eval_batch_size=1,   # batch size for evaluation
    do_eval = True,
    # NOTE(review): presumably makes prediction go through model.generate();
    # confirm against the transformers version found on sys.path.
    predict_from_generate=True,
)

In [66]:
# Trainer used only for evaluation: no train_dataset is supplied.
evaluator = Trainer(
    model=model,                         # the fine-tuned EncoderDecoderModel to evaluate
    args=eval_args,                  # eval arguments, defined above
    eval_dataset=dev_features,            # evaluation dataset
)

In [81]:
%autoreload 2
from transformers import Trainer, TrainingArguments

In [82]:
# Evaluate
# NOTE(review): eval_esnli_write_output is not defined in this notebook; it
# presumably comes from the modified transformers copy put on sys.path
# ('./../../src') and writes the predictions to a CSV -- confirm.
evaluator.eval_esnli_write_output()

***** Running %s ***** Evaluation
  Num examples = %d 9842
  Batch size = %d 1


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=9842.0, style=ProgressStyle(description_…




### Compute BLEU scores from the prediction CSV

In [31]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from statistics import mean

In [32]:
def get_bleu_score(pred_gold_csv):
    """Compute BLEU of predicted explanations against the gold explanations.

    The CSV must have a header row followed by rows whose first four columns
    are the literal representations of token-id lists: the prediction and
    gold explanations 1, 2 and 3. Special tokens are stripped with
    ``remove_special_tokens`` before scoring. The first nine data rows are
    decoded with the module-level ``tokenizer`` and printed for inspection.

    Per row, BLEU is computed for:
      * the prediction against all three gold explanations,
      * the prediction against gold 1+2 and against gold 1+3,
      * each gold explanation against the other two.
    The mean of each of these six scores is printed.

    Args:
        pred_gold_csv: path of the CSV file.

    Returns:
        Mean BLEU of the predictions against all three gold explanations.

    Raises:
        statistics.StatisticsError: if the file contains no data rows.
        ValueError, SyntaxError: if a cell is not a valid Python literal.
    """
    # Local import so the function does not depend on an extra notebook-level import.
    from ast import literal_eval

    bleu_scores = []
    bleu_scores_12 = []
    bleu_scores_13 = []
    bleu_scores_expl1 = []
    bleu_scores_expl2 = []
    bleu_scores_expl3 = []
    with open(pred_gold_csv, newline='') as f:
        reader = csv.reader(f)
        for (i, line) in enumerate(reader):
            if i == 0:
                # header row
                continue
            # literal_eval only accepts Python literals, so a crafted CSV cell
            # cannot execute arbitrary code the way eval() would allow.
            pred_expl = literal_eval(line[0])
            gold_expl_1 = literal_eval(line[1])
            gold_expl_2 = literal_eval(line[2])
            gold_expl_3 = literal_eval(line[3])
            # process the explanations before passing to compute bleu scores:
            # get rid of the CLS, SEP, and PAD tokens - tokens with id 101, 102, and 0
            pred_expl = remove_special_tokens(pred_expl)
            gold_expl_1 = remove_special_tokens(gold_expl_1)
            gold_expl_2 = remove_special_tokens(gold_expl_2)
            gold_expl_3 = remove_special_tokens(gold_expl_3)
            if i < 10:
                print(i)
                print('pred_expl: ', tokenizer.decode(pred_expl))
                print('gold_expl_1: ', tokenizer.decode(gold_expl_1))
                print('gold_expl_2: ', tokenizer.decode(gold_expl_2))
                print('gold_expl_3: ', tokenizer.decode(gold_expl_3))

            # Prediction against all / subsets of the gold explanations.
            bleu_scores.append(corpus_bleu([[gold_expl_1, gold_expl_2, gold_expl_3]], [pred_expl]))
            bleu_scores_12.append(corpus_bleu([[gold_expl_1, gold_expl_2]], [pred_expl]))
            bleu_scores_13.append(corpus_bleu([[gold_expl_1, gold_expl_3]], [pred_expl]))
            # Each gold explanation against the other two.
            bleu_scores_expl1.append(corpus_bleu([[gold_expl_2, gold_expl_3]], [gold_expl_1]))
            bleu_scores_expl2.append(corpus_bleu([[gold_expl_1, gold_expl_3]], [gold_expl_2]))
            bleu_scores_expl3.append(corpus_bleu([[gold_expl_1, gold_expl_2]], [gold_expl_3]))
    overall = mean(bleu_scores)
    print('mean(bleu_scores): ', overall)
    print('mean(bleu_scores_12): ', mean(bleu_scores_12))
    print('mean(bleu_scores_13): ', mean(bleu_scores_13))
    print('mean(bleu_scores_expl1): ', mean(bleu_scores_expl1))
    print('mean(bleu_scores_expl2): ', mean(bleu_scores_expl2))
    print('mean(bleu_scores_expl3): ', mean(bleu_scores_expl3))
    return overall

In [33]:
def remove_special_tokens(token_list, cls_id=101, sep_id=102, pad_id=0):
    """Strip the [CLS], [SEP] and [PAD] ids from an encoded sequence.

    Everything from the first ``sep_id`` onwards is dropped (this also removes
    the padding that follows it). If there is no ``sep_id``, trailing
    ``pad_id`` entries are dropped instead. Finally a leading ``cls_id`` is
    removed. A ``cls_id`` that is not the first remaining token leaves the
    sequence untouched; previously the first token was dropped whenever the id
    occurred anywhere in the input.

    Args:
        token_list: sequence of token ids.
        cls_id: id of the [CLS] token (default 101).
        sep_id: id of the [SEP] token (default 102).
        pad_id: id of the [PAD] token (default 0).

    Returns:
        A new list with the content token ids; the input is not modified.
    """
    tokens = list(token_list)

    if sep_id in tokens:
        # remove [sep] and everything after it (the padding)
        tokens = tokens[:tokens.index(sep_id)]
    else:
        # no [sep], e.g. a sequence cut off at the maximum length: drop trailing [pad]
        while tokens and tokens[-1] == pad_id:
            tokens.pop()

    # remove [cls] only when it really is the first token
    if tokens and tokens[0] == cls_id:
        tokens = tokens[1:]
    return tokens

In [83]:
# Score one prediction CSV.
# NOTE(review): the file is presumably the output of the evaluation run above -- confirm.
get_bleu_score('./09:04_23:54:37.csv')

1
pred_expl:  two women embracing does not imply they are sisters.
gold_expl_1:  the to go packages may not be from lunch.
gold_expl_2:  just because two women are embracing, does not mean they are sisters. two women that are embracing are not necessarily hugging goodbye.
gold_expl_3:  two women do not have to be sisters. embracing does not mean hugging goodbye. the women do not have to have just finished lunch to be embracing.
2
pred_expl:  two women are holding packages is a rephrasing of two women are embracing while holding to go packages.
gold_expl_1:  saying the two women are holding packages is a way to paraphrase that the packages they are holding are to go packages.
gold_expl_2:  sentence 1 states that two women are holding to - go packages. to - go packages are a form of package.
gold_expl_3:  women can embrace while they are holding packages.
3
pred_expl:  women are not men.
gold_expl_1:  in the first sentence there is an action of affection between women while on the second

0.32525492389661653

### Get inter-annotator BLEU scores of the gold explanations

In [35]:
# On dev
# For every row, score each gold explanation against the other two.
bleu_scores_expl1, bleu_scores_expl2, bleu_scores_expl3 = [], [], []
with open('./sanity-checks/esnli_dev.csv', newline='') as f:
    rows = csv.reader(f)
    next(rows, None)  # skip the header row
    for line in rows:
        # columns 4, 9 and 14 hold explanations 1, 2 and 3
        golds = [tokenizer.tokenize(line[col]) for col in (4, 9, 14)]
        per_expl_scores = (bleu_scores_expl1, bleu_scores_expl2, bleu_scores_expl3)
        for held_out, scores in enumerate(per_expl_scores):
            # references: the two explanations that are not held out, in original order
            references = golds[:held_out] + golds[held_out + 1:]
            scores.append(corpus_bleu([references], [golds[held_out]]))

In [36]:
# Mean BLEU of each gold explanation against the other two (dev split).
print(mean(bleu_scores_expl1))
print(mean(bleu_scores_expl2))
print(mean(bleu_scores_expl3))

0.15454015532820584
0.15744663375856507
0.15688393988822108


In [37]:
# On test
# For every row, score each gold explanation against the other two.
bleu_scores_expl1, bleu_scores_expl2, bleu_scores_expl3 = [], [], []
with open('/data/rosa/data/esnli/esnli_test.csv', newline='') as f:
    rows = csv.reader(f)
    next(rows, None)  # skip the header row
    for line in rows:
        # columns 4, 9 and 14 hold explanations 1, 2 and 3
        golds = [tokenizer.tokenize(line[col]) for col in (4, 9, 14)]
        per_expl_scores = (bleu_scores_expl1, bleu_scores_expl2, bleu_scores_expl3)
        for held_out, scores in enumerate(per_expl_scores):
            # references: the two explanations that are not held out, in original order
            references = golds[:held_out] + golds[held_out + 1:]
            scores.append(corpus_bleu([references], [golds[held_out]]))

In [38]:
# Mean BLEU of each gold explanation against the other two (test split).
print(mean(bleu_scores_expl1))
print(mean(bleu_scores_expl2))
print(mean(bleu_scores_expl3))

0.15304998416295615
0.1528331719420083
0.15370095822582386
