### Preprocess e-SNLI data from .csv files into batches of tensors ready to feed into models

In [None]:
import torch
import os
import sys
sys.path.append('./../../src')
import random
import numpy as np

In [None]:
# Seed every random number generator used in this notebook with the same
# fixed value so that runs are reproducible.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
# random.seed() without an argument seeds from OS entropy / the current time,
# which would leave Python's own RNG non-deterministic between runs.
random.seed(0)
np.random.seed(0)

### Get Examples 

In [None]:
from transformers import DataProcessor, InputExample
import csv

In [None]:
class EsnliProcessor(DataProcessor):
    """Read e-SNLI CSV files into ``InputExample`` objects.

    Rows are read positionally: column 1 is the label, columns 2 and 3 are
    the premise and the hypothesis, and column 4 is the first explanation.
    Dev files are additionally read at columns 9 and 14 for the second and
    third explanation. The first row of every file is treated as a header
    and skipped.
    """

    @staticmethod
    def _read_rows(data_path):
        """Yield ``(row_number, row)`` for every row after the header row."""
        # An explicit encoding keeps decoding independent of the platform
        # default; newline='' is what the csv module requires of the file.
        with open(data_path, newline='', encoding='utf-8') as f:
            for i, line in enumerate(csv.reader(f)):
                if i == 0:
                    continue
                yield i, line

    def get_train_examples(self, data_path):
        """See base class.

        Args:
            data_path: path of the CSV file to read.

        Returns:
            A list of ``InputExample`` with ``text_a`` set to
            ``premise + " [SEP] " + hypothesis`` and ``text_b`` set to the
            explanation.
        """
        examples = []
        # csv.reader yields every field as str, so no type checks are needed.
        for i, line in self._read_rows(data_path):
            examples.append(InputExample(guid="%s-%s" % ("train", i),
                                         text_a=line[2] + " [SEP] " + line[3],  # p + [SEP] + h
                                         text_b=line[4],  # expl
                                         label=line[1]))
        return examples

    def get_dev_examples(self, data_path):
        """Read a dev CSV file, keeping all three explanations per row.

        Args:
            data_path: path of the CSV file to read.

        Returns:
            A list of ``InputExample`` built like the training examples, with
            the second and third explanation passed as ``text_c`` and
            ``text_d``.
        """
        examples = []
        for i, line in self._read_rows(data_path):
            examples.append(InputExample(guid="%s-%s" % ("dev", i),
                                         text_a=line[2] + " [SEP] " + line[3],  # p + [SEP] + h
                                         text_b=line[4],  # expl 1
                                         label=line[1],
                                         text_c=line[9],  # expl 2
                                         text_d=line[14]))  # expl 3
        return examples

In [None]:
# Reader that turns e-SNLI CSV rows into InputExample objects.
processor = EsnliProcessor()

In [None]:
# Using a smaller dataset so the code is faster to test: the "training"
# examples are read from ./esnli_dev.csv here.
# NOTE(review): presumably to be pointed at the full training file for a real run - confirm.
train_examples = processor.get_train_examples('./esnli_dev.csv') 

In [None]:
# Dev examples, each carrying three explanations (text_b, text_c, text_d).
# NOTE(review): judging by the file name this is a 100-row subset - confirm.
dev_examples = processor.get_dev_examples('./esnli_dev_100.csv') 

In [None]:
# print(len(dev_examples))
# print(dev_examples[13])

### Convert Examples to Features

In [None]:
import logging as logger
from transformers import BertTokenizer

In [None]:
# Fixed length, in tokens, that every encoded sequence is padded or truncated
# to; the [CLS] and [SEP] tokens count towards this length.
# The attention mask built alongside the ids keeps the model from attending
# to padding positions.
max_seq_len = 128

In [None]:
# Tokenizer loaded from the pretrained 'bert-base-uncased' checkpoint; used
# below to split text into tokens and to map tokens to vocabulary ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def esnli_examples_to_features(examples, max_seq_len, tokenizer, cls_token='[CLS]', sep_token='[SEP]',
                               pad_token=0, mask_padding_with_zero=True):
    """
        Convert examples into a list of EsnliInputFeatures.

        For each example, text_a (premise [SEP] hypothesis) becomes the encoder
        input ids and attention mask, and text_b (the first explanation) becomes
        the decoder input ids. Every sequence is padded or truncated to
        max_seq_len.

        Does not support token_type_id, because the EncoderDecoderModel does not. Therefore, the premise
        and hypothesis are separated by a [SEP], but no token_type_id is there to tell this difference.

        Args:
            examples: sequence of objects with text_a and text_b attributes.
            max_seq_len: length of every returned id / mask list.
            tokenizer: tokenizer handed on to text_to_input_ids.
            cls_token, sep_token, pad_token, mask_padding_with_zero: handed on
                to text_to_input_ids for both the encoder and the decoder text.

        Returns:
            A list with one EsnliInputFeatures per example.
    """
    # These options used to be accepted but silently ignored; forward them so
    # that non-default values actually take effect.
    encode_options = dict(cls_token=cls_token, sep_token=sep_token,
                          pad_token=pad_token, mask_padding_with_zero=mask_padding_with_zero)

    features = []
    total = len(examples)
    for ex_index, example in enumerate(examples):
        if ex_index % 1000 == 0:
            logger.info("Writing example %d of %d", ex_index, total)

        input_ids, input_mask = text_to_input_ids(example.text_a, max_seq_len, tokenizer, **encode_options)
        # The decoder-side mask is not used, only the ids.
        decoder_input_ids, _ = text_to_input_ids(example.text_b, max_seq_len, tokenizer, **encode_options)

        # Internal sanity checks: every sequence must come back at the fixed length.
        assert len(input_ids) == max_seq_len
        assert len(input_mask) == max_seq_len
        assert len(decoder_input_ids) == max_seq_len

        features.append(EsnliInputFeatures(input_ids=input_ids,
                                           attention_mask=input_mask,
                                           decoder_input_ids=decoder_input_ids))
    return features

In [None]:
def text_to_input_ids(text, max_seq_len, tokenizer, cls_token='[CLS]', sep_token='[SEP]',
                      pad_token=0, mask_padding_with_zero=True):
    """
    Tokenize text and encode it as fixed-length ids plus an attention mask.

    The text is tokenized, truncated to max_seq_len - 2 tokens, wrapped in
    cls_token / sep_token, converted to ids and right-padded with pad_token
    up to max_seq_len.

    Args:
        text: string to encode.
        max_seq_len: length of both returned lists; must be at least 2.
        tokenizer: object providing tokenize(text) and
            convert_tokens_to_ids(tokens).
        cls_token: token placed before the text tokens.
        sep_token: token placed after the text tokens.
        pad_token: id used for the padding positions.
        mask_padding_with_zero: if True the mask holds 1 for real tokens and
            0 for padding; if False the two values are swapped.

    Returns:
        (input_ids, input_mask), two lists of length max_seq_len.

    Raises:
        ValueError: if max_seq_len is smaller than 2.
    """
    # With max_seq_len < 2 the slice bound below turns negative, which would
    # silently produce a sequence longer than max_seq_len.
    if max_seq_len < 2:
        raise ValueError(
            "max_seq_len must be at least 2 to hold cls_token and sep_token, got %r" % (max_seq_len,))

    # The -2 accounts for cls_token and sep_token that are added around the text.
    text_tokens = tokenizer.tokenize(text)[:max_seq_len - 2]
    input_ids = tokenizer.convert_tokens_to_ids([cls_token] + text_tokens + [sep_token])

    # Only real tokens are attended to; padding positions get the other flag.
    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)

    padding_length = max_seq_len - len(input_ids)
    input_mask = [real_flag] * len(input_ids) + [pad_flag] * padding_length
    input_ids = input_ids + [pad_token] * padding_length

    return input_ids, input_mask

In [None]:
class EsnliInputFeatures:
    """Feature container for one e-SNLI example.

    Stores the encoder input ids, the encoder attention mask and the decoder
    input ids exactly as passed in. ``labels`` refers to the same object as
    ``decoder_input_ids``.
    """

    def __init__(self, input_ids, attention_mask, decoder_input_ids):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        # The labels are the first explanation (expl1), i.e. the decoder ids.
        self.decoder_input_ids = self.labels = decoder_input_ids

In [None]:
# Encode the training examples into fixed-length id / mask features.
train_features = esnli_examples_to_features(train_examples, max_seq_len, tokenizer)

In [None]:
# Encode the dev examples; only text_a and text_b (the first explanation) are
# used by esnli_examples_to_features, text_c / text_d are not encoded.
dev_features = esnli_examples_to_features(dev_examples, max_seq_len, tokenizer)

In [None]:
# print(dev_features[13].input_ids)
# print(dev_features[13].attention_mask)
# print(dev_features[13].decoder_input_ids)
# print(dev_features[13].labels)

### Training

In [None]:
#%load_ext autoreload

In [None]:
#%reload_ext autoreload

In [None]:
#%autoreload 2

In [None]:
from transformers import EncoderDecoderModel
from transformers import Trainer, TrainingArguments

In [None]:
# initialize Bert2Bert
# model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') 

In [None]:
# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=3,              # total # of training epochs
#     per_device_train_batch_size=4,  # batch size per device during training
#     per_device_eval_batch_size=4,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
# )

# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_features,         # training dataset
#     eval_dataset=train_features            # evaluation dataset
# )

In [None]:
# call trainer.train() to train and trainer.evaluate(). 
# The first argument returned from forward must be the loss which you wish to optimize.

In [None]:
#trainer.train()

### Save & Load Model After Training

In [None]:
# Directory the trained model is saved to and later loaded from.
output_dir = "./trained_models/"
# GPU index, kept as a string because it is concatenated onto "cuda:" below.
cuda_id = "1" # since there's something running on 0

In [None]:
#trainer.save_model(output_dir)

In [None]:
# # Load a trained model and vocabulary that you have fine-tuned
# model = EncoderDecoderModel.from_pretrained(output_dir)
# device = torch.device("cuda:"+cuda_id)
# model.to(device)

### Empty out GPU memory?

In [None]:
# import gc

In [None]:
# # maybe these lines should be in trainer's evaluate() function?
# gc.collect()
# torch.cuda.empty_cache()

### Evaluation

In [None]:
# Restore the fine-tuned encoder-decoder model from output_dir and move it
# onto the GPU selected by cuda_id.
device = torch.device(f"cuda:{cuda_id}")
model = EncoderDecoderModel.from_pretrained(output_dir)
model.to(device)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from statistics import mean

In [None]:
# Arguments for an evaluation-only run: no training options are set.
eval_args = TrainingArguments(
    output_dir='./results',          # output directory
    per_device_eval_batch_size=2,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
)

In [None]:
# Trainer instance used only for evaluation: no train_dataset is passed.
evaluator = Trainer(
    model=model,                         # the loaded Transformers model to be evaluated
    args=eval_args,                  # eval arguments, defined above
    eval_dataset=dev_features,            # evaluation dataset
)

In [None]:
# Evaluate
# NOTE(review): eval_seq2seq_write_output is not defined in this file;
# presumably it comes from a customized Trainer found via the ./../../src
# path added to sys.path, and writes the CSV read by get_bleu_score - confirm.
evaluator.eval_seq2seq_write_output()

### Compute BLEU scores from the prediction CSV

In [None]:
# torch.tensor is a function, not a submodule, so "import torch.tensor" fails
# on current PyTorch releases; import the name from the package instead.
# NOTE(review): presumably needed so that eval() in get_bleu_score can resolve
# "tensor(...)" literals stored in the CSV - confirm.
from torch import tensor

In [None]:
def get_bleu_score(pred_gold_csv):
    """
    Compute the mean sentence-level BLEU score of the predicted explanations
    against the gold explanations stored in a csv file.

    The first row is skipped as a header. In every other row, column 0 holds
    the predicted explanation and column 1 the first gold explanation, each
    written as a Python expression that evaluates to a list of token ids.

    Input: path of the csv file
    Output: mean of the per-row bleu scores
    Raises: statistics.StatisticsError if the file has no data rows
    """
    bleu_scores = []
    with open(pred_gold_csv, newline='') as f:
        reader = csv.reader(f)
        for (i, line) in enumerate(reader):
            if i == 0:
                continue
            # NOTE(review): eval() executes whatever the csv contains; only run
            # this on files produced by our own evaluation step. If the cells
            # are plain list literals, ast.literal_eval would be a safe drop-in.
            pred_expl = eval(line[0])
            gold_expl_1 = eval(line[1])
            # Progress output must come after parsing, otherwise it shows the
            # previous row's explanations under the current row index.
            if i % 1000 == 0:
                print(i)
                print('pred_expl: ', tokenizer.decode(pred_expl))
                print('gold_expl_1: ', tokenizer.decode(gold_expl_1))
            # process the explanations before passing to compute bleu scores:
            # get rid of the CLS, SEP, and PAD tokens - tokens with id 101, 102, and 0
            pred_expl = remove_special_tokens(pred_expl)
            gold_expl_1 = remove_special_tokens(gold_expl_1)

            bleu_scores.append(sentence_bleu([gold_expl_1], pred_expl))
    return mean(bleu_scores)

In [None]:
def remove_special_tokens(token_list, cls_id=101, sep_id=102):
    """
    Strip the special tokens from a list of token ids.

    Everything from the first sep_id onwards is dropped, which also removes
    the padding that follows it, and a cls_id at the very start is dropped.
    Without a sep_id nothing is cut from the end.

    Args:
        token_list: list of token ids.
        cls_id: id of the [CLS] token (default 101).
        sep_id: id of the [SEP] token (default 102).

    Returns:
        The list without the special tokens; the input list is not modified.
    """
    # remove [sep] and everything after it ([pad])
    try:
        result = token_list[:token_list.index(sep_id)]
    except ValueError:
        result = token_list

    # remove [cls], but only when it really is the first remaining token;
    # dropping the first element just because the id occurs somewhere in the
    # list would throw away a real token.
    if result and result[0] == cls_id:
        return result[1:]
    return result

In [None]:
# Mean sentence-level BLEU over the rows of the given prediction csv.
get_bleu_score('./esnli_dev_model_expl_output.csv')

In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# Toy input for sentence_bleu: the hypothesis is also its own only reference.
hypothesis = ['This', 'is', 'a', 'cat'] 
references = [hypothesis] 

In [None]:
# Sanity check: score a sentence against itself.
sentence_bleu(references, hypothesis)

In [None]:
# Toy corpus_bleu check: a single candidate with two references, the first of
# which is identical to the candidate.
references = [[['this', 'is', 'a', 'test'], ['this', 'test']]]
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)