In [22]:
# Import block
import json
import argparse
import time
import os
import pandas as pd
from datasets import Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
#import tensorflow_datasets as tfds

import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Variable
from packaging import version

from datasets import list_datasets, load_dataset, list_metrics, load_metric
from datasets import Dataset

import sacrebleu
import datasets

In [23]:
# Utility functions from GP-VAE implementation

# Specific to dataset.
def construct_input_for_batch(tokenizer, batch, args):
    """
    Function that takes a batch from a dataset and constructs the corresponding 
    input string.
    """
    source, target = [], []
    for inp, out in zip(batch['source'], batch['target']):
        source.append(inp.strip())
        target.append(out.strip())
    if batch['id'][0] == 0:
        print(source[0])
        print(target[0])
        print()
    return source, target

def make_batch_inputs(batch, tokenizer, args, device='cuda:0'):
  """
  Function that takes a batch from a dataset and transforms it 
  """
  # Concatenate the concept names for each example in the batch.
  input_lists, _ = construct_input_for_batch(tokenizer, batch, args)
  # Use the model's tokenizer to create the batch input_ids.
  batch_features = tokenizer(input_lists, padding=True, return_tensors='pt')
  # Move all inputs to the device.
  batch_features = dict([(k, v.to(device)) for k, v in batch_features.items()])
  return batch_features

def make_batch_data(batch, tokenizer, args, device='cuda:0'):
  """
  Function that takes a batch from a dataset and transforms it 
  """
  # Concatenate the concept names for each example in the batch.
  input_lists, label_list = construct_input_for_batch(tokenizer, batch, args)
  # Use the model's tokenizer to create the batch input_ids.
  batch_features = tokenizer(input_lists, padding=True, return_tensors='pt')
  batch_labels = tokenizer(label_list, padding=True, return_tensors='pt')
  # Move all inputs to the device.
  batch_features = dict([(k, v.to(device)) for k, v in batch_features.items()])
  batch_labels = dict([(k, v.to(device)) for k, v in batch_labels.items()])
  return batch_features, batch_labels

def batch_tokenize(dataset_batch, tokenizer, args):
  """
  Reuse the function defined above to construct the batch (source, target) and 
  run them through the tokenizer.
  """
  source, target = construct_input_for_batch(tokenizer, dataset_batch, args)
  res = {
          "input_ids": tokenizer(
              source,
              padding='max_length', 
              truncation=True,
              max_length=args.encoder_max_length
          )["input_ids"],
          "labels": tokenizer(
              target,
              padding='max_length', 
              truncation=True,
              max_length=args.decoder_max_length
          )["input_ids"],
  }
  return res

def batchify_data(df, tokenizer, args):
  dataset = Dataset.from_pandas(df)
  data_tokenized = dataset.map(
    lambda batch: batch_tokenize(batch, tokenizer, args),
    batched=True
  )
  return data_tokenized

def compute_loss(batch, model, tokenizer, args):
  batch_feature, batch_label = make_batch_data(batch, tokenizer, args)
  with torch.no_grad():
    outputs = model(input_ids=batch_feature['input_ids'],
                    labels=batch_label['input_ids'])
    eval_loss = outputs.loss.item()
  return [eval_loss] 

def test_ppl(val_df, model, tokenizer, args):
  loss_dict = Dataset.from_pandas(val_df).map(
    lambda batch: {'loss': compute_loss(batch, model, tokenizer, args)},
    batched=True,
    batch_size=1,
  )
  
  eval_loss = 0.
  nb_eval_steps = 0
  for item in list(loss_dict):
      eval_loss += item['loss']
      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  ppl = torch.exp(torch.tensor(eval_loss))
  return ppl.item()

def prepare_eval(output_list):
    ref_list, pred_list = [], []
    for item in output_list:
        pred_list.append({"generated": item['generated']})
        ref_list.append({"target": [item['target']]})
    return ref_list, pred_list

In [24]:
# Replacing dataset constructing function from utilities with a custom one.
def parse_data(t_split='train'):

  # Split handling - validation set further split into 50% dev/test.
  if t_split == 'train':
    df = pd.DataFrame(load_dataset('squad')['train'])
  elif t_split in ['val','test']:
    vt_df = pd.DataFrame(load_dataset('squad')['validation'])
    df_val = vt_df.sample(frac=0.5,random_state=266)
    if t_split == 'test':
      df_test = vt_df.drop(df_val.index)
      df = df_test
    else:
      df = df_val
  else:
    raise Exception("Invalid choice of dataset split.")
  

  df['answer_text'] = df['answers'].apply(lambda x: x['text'][0])
  df['source'] = 'answer: ' + df['answer_text'] + ' context: ' + df['context'] + '</s>'
  df['target'] = df['question']

  return df                                                                                                                       

In [25]:
# Primary functions from GP-VAE implementation.

if version.parse(torch.__version__) < version.parse("1.6"):
    from transformers.file_utils import is_apex_available

    if is_apex_available():
        from apex import amp
    _use_apex = True
else:
    _use_native_amp = True
    from torch.cuda.amp import autocast


def init_linear_wt(linear):
    linear.weight.data.normal_(std=1e-4)
    if linear.bias is not None:
        linear.bias.data.normal_(std=1e-4)


class Seq2SeqModel(T5ForConditionalGeneration):
    def __init__(self, config, *args, **kwargs):
        super().__init__(config, **kwargs)
        self.mean = nn.Linear(config.d_model, config.d_model)
        init_linear_wt(self.mean)
        self.logvar = nn.Linear(config.d_model, config.d_model)
        init_linear_wt(self.logvar)
        self.latent2hidden = nn.Linear(config.d_model, config.d_model, bias=False)
        init_linear_wt(self.latent2hidden)
        self.hidden2latent = nn.Linear(config.d_model, config.d_model)
        init_linear_wt(self.hidden2latent)

    def kernel_func(self, x, y):
        """
        x, y - B x 2H
        """
        cov_xy = self.kernel_v * torch.exp(-0.5 * torch.sum(torch.pow((x - y) / self.kernel_r, 2), dim=1))
        return cov_xy

    def prior(self, hidden_states):
        """
        GP prior p(z|x) = N(mu(x), K(x, x'))
        
        enc_outputs - B x L x 2H
        """
        b, l, h = list(hidden_states.size())
        mean = hidden_states.sum(dim=2)  # B x L
        mean = self.hidden2latent(hidden_states) # B x L x K
        var = torch.zeros((b, l, l), requires_grad=False).cuda()  # B x L x L
        for i in range(l):
            for j in range(l):
                var[:, i, j] = self.kernel_func(hidden_states[:, i, :], hidden_states[:, j, :])
        return mean, var

    def posterior(self, hidden_states):
        """
        variational posterior q(z|x) = N(mu(x), f(x))
        
        mean, logvar - B x L x K
        """
        mean = self.mean(hidden_states)  # B x L x K
        logvar = self.logvar(hidden_states)  # B x L x K
        mean = mean.sum(dim=2)  # B x L
        x_var = torch.exp(logvar).sum(dim=2)  # B x L
        var_batch = []
        for b in range(mean.size(0)):
            identity_matrix = torch.eye(x_var.size(1)).cuda()
            var_batch.append(x_var[b] * identity_matrix)
        var = torch.stack(var_batch, dim=0)  # B x L x L
        return mean, var

    def reparameterize(self, mu, logvar):
        std = logvar.mul(0.5).exp_() * self.scaler
        eps = torch.cuda.FloatTensor(std.size()).normal_()
        eps = Variable(eps)
        return eps.mul(std).add_(mu)

    def compute_kld(self, p_mean, p_var, q_mean, q_var):
        k = p_var.size(1)
        log_det = torch.logdet(p_var) - torch.logdet(q_var)
        if torch.isnan(log_det).int().sum() > 0:
            if torch.isnan(q_var).int().sum() > 0:
                print('q_var has nan!!!')
                print(q_var)
        try:
            p_var_inv = torch.inverse(p_var)  # B x L x L
            trace_batch = torch.matmul(p_var_inv, q_var)  # B x L x L
            trace_list = [torch.trace(trace_batch[i]) for i in range(trace_batch.size(0))]
            trace = torch.stack(trace_list, dim=0)  # B
            mean_diff = p_mean - q_mean.unsqueeze(2)  # B x L x 1
           
            mean = torch.matmul(torch.matmul(mean_diff.transpose(1, 2), p_var_inv), mean_diff)  # B x K x K

            kld = log_det - k + trace + torch.mean(mean, dim=(1,2))
            kld = 0.5 * kld  # B
        except:
            zeros = torch.zeros(p_mean.size(0)).cuda()
            kld = zeros
            print('zero kld!!!')
        return kld.mean()

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            decoder_input_ids=None,
            decoder_attention_mask=None,
            head_mask=None,
            decoder_head_mask=None,
            encoder_outputs=None,
            past_key_values=None,
            inputs_embeds=None,
            decoder_inputs_embeds=None,
            labels=None,
            use_cache=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]
        # added z code here
        posterior_mean = self.mean(hidden_states)  # B x L x K
        posterior_logvar = self.logvar(hidden_states)  # B x L x K

        if self.from_mean:
            z = posterior_mean
        else:
            z = self.reparameterize(posterior_mean, posterior_logvar)

        input_proj = self.latent2hidden(z)  # B x L x K
        hidden_states = hidden_states + input_proj

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # If decoding with past key value states, only the last tokens
        # should be given as an input
        if past_key_values is not None:
            assert labels is None, "Decoder should not use cached key value states when training."
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids[:, -1:]
            if decoder_inputs_embeds is not None:
                decoder_inputs_embeds = decoder_inputs_embeds[:, -1:]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim ** -0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            # kl_loss
            prior_mean, prior_logvar = self.prior(hidden_states)
            posterior_mean, posterior_logvar = self.posterior(hidden_states)
            kl_loss = self.compute_kld(prior_mean, prior_logvar, posterior_mean, posterior_logvar)
            loss = loss + kl_loss

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_inputs_for_generation(
            self,
            input_ids,
            past=None,
            attention_mask=None,
            head_mask=None,
            decoder_head_mask=None,
            #        cross_attn_head_mask=None,
            use_cache=None,
            encoder_outputs=None,
            **kwargs
    ):

        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "use_cache": use_cache,
        }


class Seq2SeqTrainer(Trainer):
    """Class to finetune a Seq2Seq model."""

    def __init__(
            self,
            num_beams=4,
            max_length=32,
            *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.num_beams = num_beams
        self.max_length = max_length

    def compute_loss(self, model, inputs):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        outputs = model(input_ids=inputs['input_ids'],
                        # decoder_input_ids=inputs['labels'][:,:-1],
                        labels=inputs['labels'])
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if self.label_smoother is not None and "labels" in inputs:
            return self.label_smoother(outputs, inputs["labels"])
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            return outputs["loss"] if isinstance(outputs, dict) else outputs[0]

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        Runs the model to either generate a sequence and/or compute the loss.
        """
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        # Compute loss with labels first.
        with torch.no_grad():
            if self.args.fp16 and _use_native_amp:
                with autocast():
                    outputs = model(input_ids=inputs['input_ids'],
                                    # decoder_input_ids=inputs['labels'][:,:-1],
                                    labels=inputs['labels'])
            else:
                outputs = model(input_ids=inputs['input_ids'],
                                # decoder_input_ids=inputs['labels'][:,:-1],
                                labels=inputs['labels'])
            if has_labels:
                loss = outputs[0].mean().detach()
            else:
                loss = None
        # If we're only computing the conditional log-likelihood, return.
        if prediction_loss_only:
            return (loss, None, None)
        # Otherwise run model.generate() to get predictions.
        if isinstance(model, torch.nn.DataParallel):
            preds = model.module.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=self.num_beams,
                max_length=self.max_length,
            )
        else:
            preds = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=self.num_beams,
                max_length=self.max_length,
            )
        if len(preds) == 1:
            preds = preds[0]
        # Pad predictions if necessary so they can be concatenated across batches.
        if preds.shape[-1] < self.max_length:
            preds = torch.nn.functional.pad(
                preds, (0, self.max_length - preds.shape[-1]),
                mode='constant',
                value=self.tokenizer.pad_token_id
            )
        # Post-process labels.
        if has_labels:
            labels = inputs.get('labels')
        else:
            labels = None
        return (loss, preds, labels)


def train(args):
    # Load the dataset
    trn_df = parse_data('train', args)
    val_df = parse_data('val', args)

    # Load the pre-trained model
    ckpt_path = None
    if args.task == 'train':
        ckpt_path = args.model_name
    else:
        ckpt_path = f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/checkpoint-{args.ckpt}"
        # update timestamp and create new path for ckpt
        args.timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
    print(f"Vocab size: {len(tokenizer)}")

    train_data_tokenized = batchify_data(trn_df, tokenizer, args)
    valid_data_tokenized = batchify_data(val_df, tokenizer, args)

    model = Seq2SeqModel.from_pretrained(ckpt_path)
    model = model.to('cuda:0')
    model.kernel_v = args.kernel_v
    model.kernel_r = args.kernel_r
    model.from_mean = args.from_mean
    model.scaler = 1.0

    # Training Setup
    train_args = TrainingArguments(
        output_dir=f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}",
        do_train=True,
        do_eval=True,
        save_strategy="steps",
        save_steps=300,
        #save_steps=120,
        evaluation_strategy="steps",
        eval_steps=300,
        #eval_steps=120,
        logging_steps=100,
        # optimization args, the trainer uses the Adam optimizer
        # and has a linear warmup for the learning rate
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=1,
        learning_rate=1e-04,
        num_train_epochs=args.epochs,
        warmup_steps=0,
        lr_scheduler_type='constant',
        # misc args
        seed=42,
        save_total_limit=5,  # limit the total amount of checkpoints
        disable_tqdm=False,
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
        greater_is_better=False,
        local_rank=args.local_rank
    )

    trainer = Seq2SeqTrainer(
        num_beams=args.beam_size,
        max_length=args.decoder_max_length,
        model=model,
        args=train_args,
        train_dataset=train_data_tokenized,
        eval_dataset=valid_data_tokenized,
        tokenizer=tokenizer,
    )

    # Now that we have the trainer set up, we can finetune.
    trainer.train()


def beam_generate_sentences(batch,
                            model,
                            tokenizer,
                            args,
                            device='cuda:0'):
    # Create batch inputs.
    features = make_batch_inputs(
        batch=batch,
        tokenizer=tokenizer,
        args=args,
        device=device)
    # Generate with beam search.
    generated_ids = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        num_beams=args.beam_size,
        max_length=args.max_generation_length,
        num_return_sequences=args.num_return_sequences,
    )
    # Use model tokenizer to decode to text.
    generated_sentences = [
        tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
        for gen_ids in generated_ids
    ]
    #print(generated_sentences)
    return ['\t'.join(generated_sentences)]

def nucleus_search_sentences(batch,
                            model,
                            tokenizer,
                            args,
                            device='cuda:0'):
    # Create batch inputs.
    features = make_batch_inputs(
        batch=batch,
        tokenizer=tokenizer,
        args=args,
        device=device)
    # Generate with beam search.
    generated_ids = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        do_sample=True, 
        max_length=args.max_generation_length,
        top_p=args.top_p, 
        top_k=args.top_k,
        num_return_sequences=args.num_return_sequences
    )
    # Use model tokenizer to decode to text.
    generated_sentences = [
        tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
        for gen_ids in generated_ids
    ]
    #print(generated_sentences)
    return ['\t'.join(generated_sentences)]


def sample_sentences(batch,
                     model,
                     tokenizer,
                     args,
                     device='cuda:0'):
    # Create batch inputs.
    features = make_batch_inputs(
        batch=batch,
        tokenizer=tokenizer,
        args=args,
        device=device)

    generated_sentences = []
    for i in range(args.num_return_sequences):
        # Generate with beam search.
        generated_ids = model.generate(
            input_ids=features['input_ids'],
            attention_mask=features['attention_mask'],
            num_beams=args.beam_size,
            max_length=args.max_generation_length,
            num_return_sequences=1,
        )
        # Use model tokenizer to decode to text.
        generated_sentences += [
            tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
            for gen_ids in generated_ids
        ]
    #print(generated_sentences)
    return ['\t'.join(generated_sentences)]


def test(args):
    te_df = parse_data('test', args)
    print('Data loaded!!!')

    # Load the model
    if args.timestamp == '0':
        tokenizer = T5TokenizerFast.from_pretrained(f"{args.model_name}")
    else:
        ckpt_path = f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/checkpoint-{args.ckpt}"
        tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
    print(f"Vocab size: {len(tokenizer)}")

    if args.timestamp == '0':
        model = Seq2SeqModel.from_pretrained(f"{args.model_name}")
    else:
        ckpt_path = f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/checkpoint-{args.ckpt}"
        model = Seq2SeqModel.from_pretrained(ckpt_path)
    model = model.to('cuda:0')
    model.kernel_v = args.kernel_v
    model.kernel_r = args.kernel_r
    model.from_mean = args.from_mean
    model.scaler = args.scaler

    # Make predictions
    if args.from_mean:
        test_output = Dataset.from_pandas(te_df).map(
            lambda batch: {'generated': beam_generate_sentences(
                batch,
                model,
                tokenizer,
                args,
                device='cuda:0')
            },
            batched=True,
            batch_size=1,
        )
    else:
        test_output = Dataset.from_pandas(te_df).map(
            lambda batch: {'generated': sample_sentences(
                batch,
                model,
                tokenizer,
                args,
                device='cuda:0')
            },
            batched=True,
            batch_size=1,
        )

    # prepare evaluation data
    ref_list, pred_list = prepare_eval(list(test_output))
    reference_dict = {
        "language": "en",
        "values": ref_list,
    }
    prediction_dict = {
        "language": "en",
        "values": pred_list,
    }

    if args.timestamp == '0':
        os.makedirs(f"{args.model_name}_{args.dataset}_{args.flag}_{args.timestamp}")

    with open(
            f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/refs.json",
            'w') as f:
        f.write(json.dumps(reference_dict, indent=2))
    if args.from_mean:
        with open(
                f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/outs_mean.json",
                'w') as f:
            f.write(json.dumps(prediction_dict, indent=2))
    else:
        with open(
                f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/outs.json",
                'w') as f:
            f.write(json.dumps(prediction_dict, indent=2))

In [26]:
p = argparse.ArgumentParser(description='Hyperparams')
p.add_argument('-t', '--task', type=str, default="train",
                help="specify the task to do: (train)ing, ft(finetune), (eval)uation")
p.add_argument('-c', '--ckpt', type=str, default="600",
                help="Model checkpoint")
p.add_argument('-time', '--timestamp', type=str, default='2022-07-19-01-04-50',
                help="Model checkpoint")
p.add_argument('-f', '--flag', type=str, default='gpvae',
                help="Model checkpoint")
p.add_argument('-d', '--dataset', type=str, default="GYAFC/em",
                help="specify the dataset: GYAFC/em, GYAFC/fr")
p.add_argument('--model_name', type=str, default="t5-base",
                help="specify the model name: t5-base, facebook/blenderbot-400M-distill")
p.add_argument('-v', '--kernel_v', type=float, default=100,
                help="Hyper-parameter for prior kernel,  control the signal variance")
p.add_argument('-r', '--kernel_r', type=float, default=0.001,
                help="Hyper-parameter for prior kernel.")
p.add_argument('-s', '--scaler', type=float, default=1.0)
p.add_argument('--from_mean', action='store_true',
                help="specify whether sample from mean during generation")
p.add_argument('-bz', '--batch_size', type=int, default=8)
p.add_argument('-e', '--epochs', type=int, default=10)
p.add_argument('--encoder_max_length', type=int, default=256)
p.add_argument('--decoder_max_length', type=int, default=48)
p.add_argument('--max_generation_length', type=int, default=96)
p.add_argument('--beam_size', type=int, default=5)
p.add_argument('--num_return_sequences', type=int, default=5)
p.add_argument('--local_rank', type=int, default=-1,
                help="Multiple GPU training")
args = p.parse_args()

## jupyter fix for bad flag
#args.flag = 't5base'

### Generate predictions on validation set.

In [113]:
# Get tokenizer, model, and val set.
#ckpt_path = f"t5-base_GYAFC/em_gpvae_64.0_0.0001_2022-07-12-02-30-44/checkpoint-10800"
#ckpt_path = f"t5-base_GYAFC/em_t5gpp128enc_64.0_0.0001_2022-07-16-14-35-44/checkpoint-4500" # Pass A
#ckpt_path = f"t5-base_GYAFC/em_t5gpp128enc_64.0_0.0001_2022-07-17-17-38-30/checkpoint-5100" # Pass B
#ckpt_path = f"t5-base_GYAFC/em_t5gpp128enc_64.0_0.0001_2022-07-19-01-04-50/checkpoint-600" # Pass C
#ckpt_path = "t5-base_GYAFC/em_grid_e2_100.0_0.001_2022-07-22-02-22-41/checkpoint-5400" # Best from grid search
#ckpt_path = "t5-base_GYAFC/em_grid_e3_100.0_0.01_2022-07-22-08-10-31/checkpoint-5400" # 3rd best from grid search
#ckpt_path = "t5-base_GYAFC/em_grid_e4_100.0_0.1_2022-07-22-13-53-38/checkpoint-5400" # 2nd best from grid search
ckpt_path = "t5-base_GYAFC/em_t5gpp256enc_64.0_0.0001_2022-07-16-04-29-17/checkpoint-10800" # GPP with 256-length encoder

tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
model = Seq2SeqModel.from_pretrained(ckpt_path)
val_df = parse_data('val')

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [114]:
# Other steps
model = model.to('cuda:0')
model.kernel_v = args.kernel_v
model.kernel_r = args.kernel_r
model.from_mean = args.from_mean
model.scaler = args.scaler

In [43]:
# Make predictions
test_output = Dataset.from_pandas(val_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}

  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


### Score predictions

In [68]:
# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
final_score

{'score': 11.06835262894281,
 'counts': [22716, 7218, 3534, 1853],
 'totals': [59118, 53833, 48548, 43263],
 'precisions': [38.42484522480463,
  13.408132558096336,
  7.279393589849222,
  4.283105656103368],
 'bp': 0.9831604147503054,
 'sys_len': 59118,
 'ref_len': 60122}

In [72]:
val_df.iloc[25].source

"answer: Newcastle Student Radio context: NE1fm launched on 8 June 2007, the first full-time community radio station in the area. Newcastle Student Radio is run by students from both of the city's universities, broadcasting from Newcastle University's student's union building during term time. Radio Tyneside has been the voluntary hospital radio service for most hospitals across Newcastle and Gateshead since 1951, broadcasting on Hospedia  and online. The city also has a Radio Lollipop station based at the Great North Children's Hospital in the Newcastle Royal Victoria Infirmary.</s>"

### Save or Load Predictions

In [45]:
# Save pred/target lists.
with open('reference_dict_GPP256b.json', 'w') as fp:
    json.dump(reference_dict, fp)
with open('prediction_dict_GPP256b.json', 'w') as fp:
    json.dump(prediction_dict, fp)

In [67]:
# Open pred/target lists.
with open('reference_dict_GPP256b.json', 'r') as fp:
    reference_dict = json.load(fp)
with open('prediction_dict_GPP256b.json', 'r') as fp:
    prediction_dict = json.load(fp)

### Beam experimentation

In [7]:
# Make predictions
args.beam_size = 10
args.num_return_sequences = 10
test_output = Dataset.from_pandas(val_df[:50]).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict_test = {
    "language": "en",
    "values": ref_list,
}
prediction_dict_test = {
    "language": "en",
    "values": pred_list,
}

  0%|          | 0/50 [00:00<?, ?ba/s]

In [8]:
val_df.iloc[10].source

'answer: The UK context: The Social Charter was subsequently adopted in 1989 by 11 of the then 12 member states. The UK refused to sign the Social Charter and was exempt from the legislation covering Social Charter issues unless it agreed to be bound by the legislation. The UK subsequently was the only member state to veto the Social Charter being included as the "Social Chapter" of the 1992 Maastricht Treaty - instead, an Agreement on Social Policy was added as a protocol. Again, the UK was exempt from legislation arising from the protocol, unless it agreed to be bound by it. The protocol was to become known as "Social Chapter", despite not actually being a chapter of the Maastricht Treaty. To achieve aims of the Agreement on Social Policy the European Union was to "support and complement" the policies of member states. The aims of the Agreement on Social Policy are:</s>'

In [9]:
reference_dict_test['values'][10]

{'target': ['Which member state declined to sign the Social Charter?']}

In [13]:
prediction_dict_test['values'][17]['generated'].split('\t')

['What is the name of a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be',
 'What is the name of a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people?',
 'What is the name of a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group of people who are considered to be a group 

### Generate and save predictions for multiple sets

In [21]:
# Settings
paths = ['t5-base_GYAFC/256_backup/checkpoint-' + str(x) for x in range(3000,8400,300)]
names = ['ckpt' + str(x) for x in range(3000,8400,300)]

# Get data
val_df = parse_data('val')

# Main loop
for x in range (18):
    tokenizer = T5TokenizerFast.from_pretrained(paths[x])
    model = Seq2SeqModel.from_pretrained(paths[x])
    model = model.to('cuda:0')
    model.kernel_v = args.kernel_v
    model.kernel_r = args.kernel_r
    model.from_mean = True
    model.scaler = args.scaler
    
    # Make predictions
    test_output = Dataset.from_pandas(val_df).map(
        lambda batch: {'generated': beam_generate_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

    # prepare evaluation data
    ref_list, pred_list = prepare_eval(list(test_output))
    reference_dict = {
        "language": "en",
        "values": ref_list,
    }
    prediction_dict = {
        "language": "en",
        "values": pred_list,
    }
    
    # Save pred/target lists.
    with open(names[x] + '_reference_dict.json', 'w') as fp:
        json.dump(reference_dict, fp)
    with open(names[x] + '_prediction_dict.json', 'w') as fp:
        json.dump(prediction_dict, fp)
        
    # Calculate BLEU-4.
    metric = datasets.load_metric('sacrebleu')
    fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
    fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
    for model_predictions, gold_references in zip(fin_preds,fin_targets):
        metric.add(predictions=model_predictions, references=gold_references)
    final_score = metric.compute()
    print(names[x], ' | ', final_score)
    print()

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt3000  |  {'score': 4.693644469523313, 'counts': [19122, 4904, 2224, 1149], 'totals': [91961, 86676, 81391, 76106], 'precisions': [20.79359728580594, 5.657852231298168, 2.7324888501185636, 1.5097364202559589], 'bp': 1.0, 'sys_len': 91961, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt3300  |  {'score': 5.807745533918182, 'counts': [19859, 5643, 2721, 1494], 'totals': [87695, 82410, 77125, 71840], 'precisions': [22.64553281258909, 6.847469967236986, 3.5280388978930306, 2.079621380846325], 'bp': 1.0, 'sys_len': 87695, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt3600  |  {'score': 6.212544972894582, 'counts': [21772, 7051, 3556, 1977], 'totals': [100376, 95091, 89806, 84521], 'precisions': [21.690443930820116, 7.415002471316949, 3.9596463487962943, 2.3390636646513885], 'bp': 1.0, 'sys_len': 100376, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt3900  |  {'score': 6.82551180183416, 'counts': [25177, 9225, 4782, 2690], 'totals': [116406, 111121, 105836, 100551], 'precisions': [21.62861020909575, 8.30176114325825, 4.518311349635285, 2.6752593211405156], 'bp': 1.0, 'sys_len': 116406, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt4200  |  {'score': 8.39951916576026, 'counts': [25999, 9731, 5095, 2882], 'totals': [101062, 95777, 90492, 85207], 'precisions': [25.725792088025173, 10.160059304425905, 5.630331963046457, 3.3823512152757402], 'bp': 1.0, 'sys_len': 101062, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt4500  |  {'score': 12.464174438135693, 'counts': [26210, 10239, 5598, 3257], 'totals': [75289, 70004, 64719, 59434], 'precisions': [34.8125224136328, 14.626307068167533, 8.649701015157836, 5.48002826664872], 'bp': 1.0, 'sys_len': 75289, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt4800  |  {'score': 12.213875750508853, 'counts': [26270, 10405, 5670, 3287], 'totals': [77350, 72065, 66780, 61495], 'precisions': [33.96250808015514, 14.438354263512107, 8.49056603773585, 5.345150012196114], 'bp': 1.0, 'sys_len': 77350, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt5100  |  {'score': 15.031966522012546, 'counts': [25591, 9996, 5428, 3147], 'totals': [62338, 57053, 51768, 46483], 'precisions': [41.05200680162982, 17.52055106655215, 10.485241848246021, 6.770217068605727], 'bp': 1.0, 'sys_len': 62338, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt5400  |  {'score': 13.711000069825705, 'counts': [27133, 10726, 5848, 3419], 'totals': [71902, 66617, 61332, 56047], 'precisions': [37.736085227114685, 16.100995241454886, 9.534989891084589, 6.100237300836798], 'bp': 1.0, 'sys_len': 71902, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt5700  |  {'score': 13.818584075234822, 'counts': [26432, 10558, 5752, 3348], 'totals': [70170, 64885, 59600, 54315], 'precisions': [37.668519310246545, 16.271865608384065, 9.651006711409396, 6.164043082021541], 'bp': 1.0, 'sys_len': 70170, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt6000  |  {'score': 13.803321544924831, 'counts': [26223, 10170, 5530, 3208], 'totals': [68302, 63017, 57732, 52447], 'precisions': [38.392726420895436, 16.138502308900772, 9.578743158040602, 6.116651095391538], 'bp': 1.0, 'sys_len': 68302, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt6300  |  {'score': 15.013156174785042, 'counts': [26137, 10138, 5522, 3225], 'totals': [63450, 58165, 52880, 47595], 'precisions': [41.19306540583136, 17.429725780108313, 10.44251134644478, 6.775921840529468], 'bp': 1.0, 'sys_len': 63450, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt6600  |  {'score': 15.177581264505072, 'counts': [25819, 10009, 5433, 3164], 'totals': [62042, 56757, 51472, 46187], 'precisions': [41.61535733857709, 17.634829184065403, 10.55525334162263, 6.850412453720744], 'bp': 1.0, 'sys_len': 62042, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt6900  |  {'score': 13.876068448931703, 'counts': [25429, 9707, 5265, 3045], 'totals': [65392, 60107, 54822, 49537], 'precisions': [38.88701981893809, 16.149533332224202, 9.603808689941994, 6.1469204836788665], 'bp': 1.0, 'sys_len': 65392, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt7200  |  {'score': 15.842365519783677, 'counts': [26146, 10195, 5510, 3142], 'totals': [60289, 55004, 49719, 44434], 'precisions': [43.36777853339747, 18.53501563522653, 11.082282427241095, 7.071161723004906], 'bp': 1.0, 'sys_len': 60289, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt7500  |  {'score': 11.651170887269602, 'counts': [22833, 7553, 3769, 1986], 'totals': [56908, 51623, 46338, 41053], 'precisions': [40.12265410838546, 14.63107529589524, 8.133713151193405, 4.837648892894551], 'bp': 0.9450881069496168, 'sys_len': 56908, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt7800  |  {'score': 17.360430015067553, 'counts': [27227, 11267, 6247, 3649], 'totals': [60934, 55649, 50364, 45079], 'precisions': [44.68277152328749, 20.246545310787255, 12.403701056310062, 8.094678231549059], 'bp': 1.0, 'sys_len': 60934, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


ckpt8100  |  {'score': 16.924576967521308, 'counts': [26493, 10762, 5934, 3455], 'totals': [58711, 53426, 48141, 42856], 'precisions': [45.12442302123963, 20.14375023396848, 12.326291518663925, 8.061881650177337], 'bp': 0.9762535171880073, 'sys_len': 58711, 'ref_len': 60122}



### Nucleus search experimentation

In [32]:
#ckpt_path = "t5-base_GYAFC/em_grid_e2_100.0_0.001_2022-07-22-02-22-41/checkpoint-5400" # Best from grid search
#ckpt_path = "t5-base_GYAFC/em_grid_e3_100.0_0.01_2022-07-22-08-10-31/checkpoint-5400" # 3rd best from grid search
#ckpt_path = "t5-base_GYAFC/em_grid_e4_100.0_0.1_2022-07-22-13-53-38/checkpoint-5400" # 2nd best from grid search
ckpt_path = "t5-base_GYAFC/256_backup/checkpoint-7800"

# Get model and data.
tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
model = Seq2SeqModel.from_pretrained(ckpt_path)
val_df = parse_data('val')
valid_data_tokenized = batchify_data(val_df, tokenizer, args)

# Other steps.
model = model.to('cuda:0')
model.kernel_v = 100 #args.kernel_v
model.kernel_r = 0.001 #args.kernel_r
model.from_mean = args.from_mean
model.scaler = args.scaler

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [57]:
# Make predictions
args.top_p = 0.5
args.top_k = 5
args.num_return_sequences = 10

test_output2 = Dataset.from_pandas(val_df).map(
    lambda batch: {'generated': nucleus_search_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

# prepare evaluation data
ref_list2, pred_list2 = prepare_eval(list(test_output))
reference_dict2 = {
    "language": "en",
    "values": ref_list2,
}
prediction_dict2 = {
    "language": "en",
    "values": pred_list2,
}

  0%|          | 0/5285 [00:00<?, ?ba/s]

In [58]:
# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict2['values'][x]['target'] for x in range(0,len(reference_dict2['values']))]
fin_preds = [prediction_dict2['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict2['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
final_score

{'score': 8.60465608915507,
 'counts': [205, 58, 26, 13],
 'totals': [550, 500, 450, 400],
 'precisions': [37.27272727272727, 11.6, 5.777777777777778, 3.25],
 'bp': 0.9064840734837171,
 'sys_len': 550,
 'ref_len': 604}

In [59]:
reference_dict_test['values'][10]['target']

['Which member state declined to sign the Social Charter?']

In [60]:
prediction_dict_test['values'][10]['generated'].split('\t')

['What was the purpose of the Agreement on Social Policy?',
 'What did the UK refuse to sign the Treaty of Maastricht?',
 'What did the UK refuse to sign?',
 'What was the purpose of the agreement on Social Policy?',
 'What was the purpose of the Treaty of Maastricht?',
 'What was the name of the agreement that the UK refused to sign?',
 'What did the UK refuse to sign the Social Charter?',
 'What did the UK refuse to sign the Treaty on Social Policy?',
 'What was the purpose of the agreement?',
 'What was the purpose of the agreement on social policy?']

In [64]:
val_df.iloc[10].source

'answer: The UK context: The Social Charter was subsequently adopted in 1989 by 11 of the then 12 member states. The UK refused to sign the Social Charter and was exempt from the legislation covering Social Charter issues unless it agreed to be bound by the legislation. The UK subsequently was the only member state to veto the Social Charter being included as the "Social Chapter" of the 1992 Maastricht Treaty - instead, an Agreement on Social Policy was added as a protocol. Again, the UK was exempt from legislation arising from the protocol, unless it agreed to be bound by it. The protocol was to become known as "Social Chapter", despite not actually being a chapter of the Maastricht Treaty. To achieve aims of the Agreement on Social Policy the European Union was to "support and complement" the policies of member states. The aims of the Agreement on Social Policy are:</s>'

### Nucleus search and top-k combination grid search

In [31]:
pk_list = [(0.5,100),(0.6,5)]

for x in range(len(pk_list)):
    # Make predictions
    args.top_p = pk_list[x][0]
    args.top_k = pk_list[x][1]

    test_output = Dataset.from_pandas(val_df).map(
        lambda batch: {'generated': nucleus_search_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

    # prepare evaluation data
    ref_list, pred_list = prepare_eval(list(test_output))
    reference_dict_test = {
        "language": "en",
        "values": ref_list,
    }
    prediction_dict_test = {
        "language": "en",
        "values": pred_list,
    }
    
    # Calculate BLEU-4.
    metric = datasets.load_metric('sacrebleu')
    fin_targets = [reference_dict_test['values'][x]['target'] for x in range(0,len(reference_dict_test['values']))]
    fin_preds = [prediction_dict_test['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict_test['values']))]
    for model_predictions, gold_references in zip(fin_preds,fin_targets):
        metric.add(predictions=model_predictions, references=gold_references)
    final_score = metric.compute()
    print('p=',args.top_p,' ',
          'k=',args.top_k,' ',
        final_score)

  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


p= 0.5   k= 100   {'score': 14.137242087308438, 'counts': [25007, 9072, 4705, 2606], 'totals': [56196, 50911, 45626, 40341], 'precisions': [44.499608513061425, 17.819331775058437, 10.312102748432912, 6.459929104385117], 'bp': 0.9325219175416379, 'sys_len': 56196, 'ref_len': 60122}


  0%|          | 0/5285 [00:00<?, ?ba/s]

p= 0.6   k= 5   {'score': 13.973700972325515, 'counts': [24883, 9097, 4658, 2514], 'totals': [55986, 50701, 45416, 40131], 'precisions': [44.44503983138642, 17.94244689453857, 10.256297340144442, 6.2644838155042235], 'bp': 0.9287872168634487, 'sys_len': 55986, 'ref_len': 60122}


### Model Structure

In [100]:
model

Seq2SeqModel(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dropout(p=0.1, in

In [112]:
model.latent2hidden

Linear(in_features=768, out_features=768, bias=False)

### Test on novel examples

In [166]:
# Standardize beam size and return sequences.
args.beam_size = 10
args.num_return_sequences = 10

# Example A - Superhero Question
novel_answer = "Batman"
novel_context = "The Justice League is made up of many heroes. Superman is an alien with super strength, x-ray vision, and the ability to fly. Batman uses his vast wealth to buy gadgets. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth. The Flash is just really fast."
novel_df = pd.DataFrame()
novel_df['source'] = ["answer: " + novel_answer + " context: " + novel_context]
novel_df['target'] = ['Who is the richest member of the Justice League?']
novel_df['id'] = 0


novel_example_output = Dataset.from_pandas(novel_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

novel_example_output

# Prepare evaluation data
novel_refs_list, novel_preds_list = prepare_eval(list(novel_example_output))

# Show output
novel_preds_list[0]['generated'].split('\t')

  0%|          | 0/1 [00:00<?, ?ba/s]

answer: Batman context: The Justice League is made up of many heroes. Superman is an alien with super strength, x-ray vision, and the ability to fly. Batman uses his vast wealth to buy gadgets. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth. The Flash is just really fast.
Who is the richest member of the Justice League?



['What is the name of the superhero in the Justice League?',
 "What is Superman's role in the Justice League?",
 "What is Batman's role in the Justice League?",
 'What is the name of the superhero who has the ability to fly?',
 "What is the name of the superhero in Batman's Justice League?",
 "What is Superman's name?",
 "What is Superman's nickname?",
 "What is the name of Batman's favorite superhero?",
 "What is Superman's role in the Justice League called?",
 'What is the name of the superhero who has the power to fly?']

In [161]:
# Standardize beam size and return sequences.
args.beam_size = 10
args.num_return_sequences = 10

# Example B - Machine Learning
novel_answer = "Stochastic gradient descent"
novel_context = "Stochastic gradient descent (often abbreviated SGD) is an iterative method for optimizing an objective function with suitable smoothness properties (e.g. differentiable or subdifferentiable). It can be regarded as a stochastic approximation of gradient descent optimization, since it replaces the actual gradient (calculated from the entire data set) by an estimate thereof (calculated from a randomly selected subset of the data). Especially in high-dimensional optimization problems this reduces the very high computational burden, achieving faster iterations in trade for a lower convergence rate."
novel_df = pd.DataFrame()
novel_df['source'] = ["answer: " + novel_answer + " context: " + novel_context]
novel_df['target'] = ['What does SGD stand for?']
novel_df['id'] = 0


novel_example_output = Dataset.from_pandas(novel_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

novel_example_output

# Prepare evaluation data
novel_refs_list, novel_preds_list = prepare_eval(list(novel_example_output))

# Show output
novel_preds_list[0]['generated'].split('\t')

  0%|          | 0/1 [00:00<?, ?ba/s]

answer: Stochastic gradient descent context: Stochastic gradient descent (often abbreviated SGD) is an iterative method for optimizing an objective function with suitable smoothness properties (e.g. differentiable or subdifferentiable). It can be regarded as a stochastic approximation of gradient descent optimization, since it replaces the actual gradient (calculated from the entire data set) by an estimate thereof (calculated from a randomly selected subset of the data). Especially in high-dimensional optimization problems this reduces the very high computational burden, achieving faster iterations in trade for a lower convergence rate.
What does SGD stand for?



['What does SGD stand for?',
 'What is the acronym for SGD?',
 'What is an example of a stochastic gradient descent?',
 'What is a synonym for SGD?',
 'What is SGD?',
 'What is SGD also known as?',
 'What is the acronym of SGD?',
 'What is an example of a stochastic approach to gradient descent?',
 'What is the name of the process that optimizes an objective function?',
 'What is an example of a stochastic gradient descent technique?']

In [163]:
# Standardize beam size and return sequences.
args.beam_size = 10
args.num_return_sequences = 10

# Example C - World History
novel_answer = "1066"
novel_context = "William I, usually known as William the Conqueror and sometimes William the Bastard, was the first Norman king of England, reigning from 1066 until his death in 1087. A descendant of Rollo, he was Duke of Normandy from 1035 onward. By 1060, following a long struggle to establish his throne, his hold on Normandy was secure. In 1066, following the death of Edward the Confessor, William invaded England, leading an army of Normans to victory over the Anglo-Saxon forces of Harold Godwinson at the Battle of Hastings, and suppressed subsequent English revolts in what has become known as the Norman Conquest. The rest of his life was marked by struggles to consolidate his hold over England and his continental lands, and by difficulties with his eldest son, Robert Curthose."
novel_df = pd.DataFrame()
novel_df['source'] = ["answer: " + novel_answer + " context: " + novel_context]
novel_df['target'] = ['At which battle did William the Conqueror defeat Harold Godwinson?']
novel_df['id'] = 0


novel_example_output = Dataset.from_pandas(novel_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

novel_example_output

# Prepare evaluation data
novel_refs_list, novel_preds_list = prepare_eval(list(novel_example_output))

# Show output
novel_preds_list[0]['generated'].split('\t')

  0%|          | 0/1 [00:00<?, ?ba/s]

answer: 1066 context: William I, usually known as William the Conqueror and sometimes William the Bastard, was the first Norman king of England, reigning from 1066 until his death in 1087. A descendant of Rollo, he was Duke of Normandy from 1035 onward. By 1060, following a long struggle to establish his throne, his hold on Normandy was secure. In 1066, following the death of Edward the Confessor, William invaded England, leading an army of Normans to victory over the Anglo-Saxon forces of Harold Godwinson at the Battle of Hastings, and suppressed subsequent English revolts in what has become known as the Norman Conquest. The rest of his life was marked by struggles to consolidate his hold over England and his continental lands, and by difficulties with his eldest son, Robert Curthose.
At which battle did William the Conqueror defeat Harold Godwinson?



['What year was William the Conqueror king of England?',
 'What year was William I king of England?',
 'What was the name of the first Norman king of England?',
 "What was William the Conqueror's name?",
 'What year did William I reign as king of England?',
 'What year was William the Conqueror born?',
 'What year was William the Conqueror king of England crowned?',
 "What was William the Conqueror's title?",
 'What year was William the Conqueror king of England born?',
 'What year was William the Conqueror king?']

### Final predictions on test set

In [33]:
# Make predictions (Beam)
test_df = parse_data('test')

test_output = Dataset.from_pandas(test_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}

# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
print(final_score)

# Save pred/target lists.
with open('FINAL_reference_dict_GPP256.json', 'w') as fp:
    json.dump(reference_dict, fp)
with open('FINAL_prediction_dict_GPP256.json', 'w') as fp:
    json.dump(prediction_dict, fp)

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


{'score': 16.56587608025032, 'counts': [26599, 10808, 5858, 3296], 'totals': [60367, 55082, 49797, 44512], 'precisions': [44.062153163152054, 19.621654987110126, 11.763760869128662, 7.404744787922358], 'bp': 1.0, 'sys_len': 60367, 'ref_len': 59985}


In [35]:
# Make predictions (Nucleus)
test_df = parse_data('test')
args.top_p = 0.5
args.top_k = 100

test_output = Dataset.from_pandas(test_df).map(
        lambda batch: {'generated': nucleus_search_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}

# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
print(final_score)

# Save pred/target lists.
with open('FINAL_NS_reference_dict_GPP256.json', 'w') as fp:
    json.dump(reference_dict, fp)
with open('FINAL_NS_prediction_dict_GPP256.json', 'w') as fp:
    json.dump(prediction_dict, fp)

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5285 [00:00<?, ?ba/s]

{'score': 13.64246031118634, 'counts': [24541, 8930, 4546, 2397], 'totals': [56158, 50873, 45588, 40303], 'precisions': [43.699918088251, 17.553515617321565, 9.9719224357287, 5.947448080788031], 'bp': 0.9341231345067288, 'sys_len': 56158, 'ref_len': 59985}
