In [1]:
# Import block
import json
import argparse
import time
import os
import pandas as pd

from datasets import Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput

import torch
from torch import nn
from torch.nn import functional as F
from torch.autograd import Variable
from packaging import version

from datasets import list_datasets, load_dataset, list_metrics, load_metric
from datasets import Dataset

import sacrebleu
import datasets

2022-07-30 03:31:37.902484: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-07-30 03:31:37.906641: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-30 03:31:37.906655: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Utility functions from GP-VAE implementation

# Specific to dataset.
def construct_input_for_batch(tokenizer, batch, args):
    """
    Function that takes a batch from a dataset and constructs the corresponding 
    input string.
    """
    source, target = [], []
    for inp, out in zip(batch['source'], batch['target']):
        source.append(inp.strip())
        target.append(out.strip())
    if batch['id'][0] == 0:
        print(source[0])
        print(target[0])
        print()
    return source, target

def make_batch_inputs(batch, tokenizer, args, device='cuda:0'):
  """
  Function that takes a batch from a dataset and transforms it 
  """
  # Concatenate the concept names for each example in the batch.
  input_lists, _ = construct_input_for_batch(tokenizer, batch, args)
  # Use the model's tokenizer to create the batch input_ids.
  batch_features = tokenizer(input_lists, padding=True, return_tensors='pt')
  # Move all inputs to the device.
  batch_features = dict([(k, v.to(device)) for k, v in batch_features.items()])
  return batch_features

def make_batch_data(batch, tokenizer, args, device='cuda:0'):
  """
  Function that takes a batch from a dataset and transforms it 
  """
  # Concatenate the concept names for each example in the batch.
  input_lists, label_list = construct_input_for_batch(tokenizer, batch, args)
  # Use the model's tokenizer to create the batch input_ids.
  batch_features = tokenizer(input_lists, padding=True, return_tensors='pt')
  batch_labels = tokenizer(label_list, padding=True, return_tensors='pt')
  # Move all inputs to the device.
  batch_features = dict([(k, v.to(device)) for k, v in batch_features.items()])
  batch_labels = dict([(k, v.to(device)) for k, v in batch_labels.items()])
  return batch_features, batch_labels

def batch_tokenize(dataset_batch, tokenizer, args):
  """
  Reuse the function defined above to construct the batch (source, target) and 
  run them through the tokenizer.
  """
  source, target = construct_input_for_batch(tokenizer, dataset_batch, args)
  res = {
          "input_ids": tokenizer(
              source,
              padding='max_length', 
              truncation=True,
              max_length=args.encoder_max_length
          )["input_ids"],
          "labels": tokenizer(
              target,
              padding='max_length', 
              truncation=True,
              max_length=args.decoder_max_length
          )["input_ids"],
  }
  return res

def batchify_data(df, tokenizer, args):
  dataset = Dataset.from_pandas(df)
  data_tokenized = dataset.map(
    lambda batch: batch_tokenize(batch, tokenizer, args),
    batched=True
  )
  return data_tokenized

def compute_loss(batch, model, tokenizer, args):
  batch_feature, batch_label = make_batch_data(batch, tokenizer, args)
  with torch.no_grad():
    outputs = model(input_ids=batch_feature['input_ids'],
                    labels=batch_label['input_ids'])
    eval_loss = outputs.loss.item()
  return [eval_loss] 

def test_ppl(val_df, model, tokenizer, args):
  loss_dict = Dataset.from_pandas(val_df).map(
    lambda batch: {'loss': compute_loss(batch, model, tokenizer, args)},
    batched=True,
    batch_size=1,
  )
  
  eval_loss = 0.
  nb_eval_steps = 0
  for item in list(loss_dict):
      eval_loss += item['loss']
      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  ppl = torch.exp(torch.tensor(eval_loss))
  return ppl.item()

def prepare_eval(output_list):
    ref_list, pred_list = [], []
    for item in output_list:
        pred_list.append({"generated": item['generated']})
        ref_list.append({"target": [item['target']]})
    return ref_list, pred_list

In [6]:
# Replacing dataset constructing function from utilities with a custom one.
def parse_data(t_split='train'):

  # Split handling - validation set further split into 50% dev/test.
  if t_split == 'train':
    df = pd.DataFrame(load_dataset('squad')['train'])
  elif t_split in ['val','test']:
    vt_df = pd.DataFrame(load_dataset('squad')['validation'])
    df_val = vt_df.sample(frac=0.5,random_state=266)
    if t_split == 'test':
      df_test = vt_df.drop(df_val.index)
      df = df_test
    else:
      df = df_val
  else:
    raise Exception("Invalid choice of dataset split.")
  

  df['answer_text'] = df['answers'].apply(lambda x: x['text'][0])
  df['source'] = 'answer: ' + df['answer_text'] + ' context: ' + df['context'] + '</s>'
  df['target'] = df['question']

  return df                                                                                                                       

In [3]:
if version.parse(torch.__version__) < version.parse("1.6"):
    from transformers.file_utils import is_apex_available

    if is_apex_available():
        from apex import amp
    _use_apex = True
else:
    _use_native_amp = True
    from torch.cuda.amp import autocast

class Seq2SeqTrainer(Trainer):
    """Class to finetune a Seq2Seq model."""

    def __init__(
            self,
            num_beams=4,
            max_length=32,
            *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.num_beams = num_beams
        self.max_length = max_length

    def compute_loss(self, model, inputs):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        outputs = model(input_ids=inputs['input_ids'],
                        # decoder_input_ids=inputs['labels'][:,:-1],
                        labels=inputs['labels'])
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if self.label_smoother is not None and "labels" in inputs:
            return self.label_smoother(outputs, inputs["labels"])
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            return outputs["loss"] if isinstance(outputs, dict) else outputs[0]

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        Runs the model to either generate a sequence and/or compute the loss.
        """
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)
        # Compute loss with labels first.
        with torch.no_grad():
            if self.args.fp16 and _use_native_amp:
                with autocast():
                    outputs = model(input_ids=inputs['input_ids'],
                                    # decoder_input_ids=inputs['labels'][:,:-1],
                                    labels=inputs['labels'])
            else:
                outputs = model(input_ids=inputs['input_ids'],
                                # decoder_input_ids=inputs['labels'][:,:-1],
                                labels=inputs['labels'])
            if has_labels:
                loss = outputs[0].mean().detach()
            else:
                loss = None
        # If we're only computing the conditional log-likelihood, return.
        if prediction_loss_only:
            return (loss, None, None)
        # Otherwise run model.generate() to get predictions.
        if isinstance(model, torch.nn.DataParallel):
            preds = model.module.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=self.num_beams,
                max_length=self.max_length,
            )
        else:
            preds = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=self.num_beams,
                max_length=self.max_length,
            )
        if len(preds) == 1:
            preds = preds[0]
        # Pad predictions if necessary so they can be concatenated across batches.
        if preds.shape[-1] < self.max_length:
            preds = torch.nn.functional.pad(
                preds, (0, self.max_length - preds.shape[-1]),
                mode='constant',
                value=self.tokenizer.pad_token_id
            )
        # Post-process labels.
        if has_labels:
            labels = inputs.get('labels')
        else:
            labels = None
        return (loss, preds, labels)


def train(args):
    # Load the dataset
    trn_df = parse_data('train')
    val_df = parse_data('val')

    # Load the pre-trained model
    ckpt_path = None
    if args.task == 'train':
        ckpt_path = args.model_name
    else:
        ckpt_path = f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/checkpoint-{args.ckpt}"
        # update timestamp and create new path for ckpt
        args.timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
    print(f"Vocab size: {len(tokenizer)}")

    train_data_tokenized = batchify_data(trn_df, tokenizer, args)
    valid_data_tokenized = batchify_data(val_df, tokenizer, args)

    model = T5ForConditionalGeneration.from_pretrained(ckpt_path)
    model = model.to('cuda:0')

    # Training Setup
    train_args = TrainingArguments(
        output_dir=f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}",
        do_train=True,
        do_eval=True,
        save_strategy="steps",
        save_steps=300,
        evaluation_strategy="steps",
        eval_steps=300,
        logging_steps=100,
        # optimization args, the trainer uses the Adam optimizer
        # and has a linear warmup for the learning rate
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=1,
        learning_rate=1e-04,
        num_train_epochs=args.epochs,
        warmup_steps=0,
        lr_scheduler_type='constant',
        # misc args
        seed=42,
        save_total_limit=5,  # limit the total amount of checkpoints
        disable_tqdm=False,
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
        greater_is_better=False,
        local_rank=args.local_rank
    )

    trainer = Seq2SeqTrainer(
        num_beams=args.beam_size,
        max_length=args.decoder_max_length,
        model=model,
        args=train_args,
        train_dataset=train_data_tokenized,
        eval_dataset=valid_data_tokenized,
        tokenizer=tokenizer,
    )

    # Now that we have the trainer set up, we can finetune.
    trainer.train()


def beam_generate_sentences(batch,
                            model,
                            tokenizer,
                            args,
                            device='cuda:0'):
    # Create batch inputs.
    features = make_batch_inputs(
        batch=batch,
        tokenizer=tokenizer,
        args=args,
        device=device)
    # Generate with beam search.
    generated_ids = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        num_beams=args.beam_size,
        max_length=args.max_generation_length,
        num_return_sequences=args.num_return_sequences,
    )
    # Use model tokenizer to decode to text.
    generated_sentences = [
        tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
        for gen_ids in generated_ids
    ]
    # print(generated_sentences)
    return ['\t'.join(generated_sentences)]

def nucleus_search_sentences(batch,
                            model,
                            tokenizer,
                            args,
                            device='cuda:0'):
    # Create batch inputs.
    features = make_batch_inputs(
        batch=batch,
        tokenizer=tokenizer,
        args=args,
        device=device)
    # Generate with nucleus search.
    generated_ids = model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        do_sample=True, 
        max_length=args.max_generation_length,
        top_p=args.top_p, 
        top_k=args.top_k,
        num_return_sequences=args.num_return_sequences
    )
    # Use model tokenizer to decode to text.
    generated_sentences = [
        tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
        for gen_ids in generated_ids
    ]
    #print(generated_sentences)
    return ['\t'.join(generated_sentences)]

def sample_sentences(batch,
                     model,
                     tokenizer,
                     args,
                     device='cuda:0'):
    # Create batch inputs.
    features = make_batch_inputs(
        batch=batch,
        tokenizer=tokenizer,
        args=args,
        device=device)

    generated_sentences = []
    for i in range(args.num_return_sequences):
        # Generate with beam search.
        generated_ids = model.generate(
            input_ids=features['input_ids'],
            attention_mask=features['attention_mask'],
            num_beams=args.beam_size,
            max_length=args.max_generation_length,
            num_return_sequences=1,
        )
        # Use model tokenizer to decode to text.
        generated_sentences += [
            tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True)
            for gen_ids in generated_ids
        ]
    # print(generated_sentences)
    return ['\t'.join(generated_sentences)]


def test(args):
    te_df = parse_data('test')
    print('Data loaded!!!')

    # Load the model
    if args.timestamp == '0':
        tokenizer = T5TokenizerFast.from_pretrained(f"{args.model_name}")
    else:
        ckpt_path = f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/checkpoint-{args.ckpt}"
        tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
    print(f"Vocab size: {len(tokenizer)}")

    if args.timestamp == '0':
        model = T5ForConditionalGeneration.from_pretrained(f"{args.model_name}")
    else:
        ckpt_path = f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/checkpoint-{args.ckpt}"
        model = T5ForConditionalGeneration.from_pretrained(ckpt_path)
    model = model.to('cuda:0')
    model.kernel_v = args.kernel_v
    model.kernel_r = args.kernel_r
    model.from_mean = args.from_mean
    model.scaler = args.scaler

    # Make predictions
    if args.from_mean:
        test_output = Dataset.from_pandas(te_df).map(
            lambda batch: {'generated': beam_generate_sentences(
                batch,
                model,
                tokenizer,
                args,
                device='cuda:0')
            },
            batched=True,
            batch_size=1,
        )
    else:
        test_output = Dataset.from_pandas(te_df).map(
            lambda batch: {'generated': sample_sentences(
                batch,
                model,
                tokenizer,
                args,
                device='cuda:0')
            },
            batched=True,
            batch_size=1,
        )

    # prepare evaluation data
    ref_list, pred_list = prepare_eval(list(test_output))
    reference_dict = {
        "language": "en",
        "values": ref_list,
    }
    prediction_dict = {
        "language": "en",
        "values": pred_list,
    }

    if args.timestamp == '0':
        os.makedirs(f"{args.model_name}_{args.dataset}_{args.flag}_{args.timestamp}")

    with open(
            f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/refs.json",
            'w') as f:
        f.write(json.dumps(reference_dict, indent=2))
    if args.from_mean:
        with open(
                f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/outs_mean.json",
                'w') as f:
            f.write(json.dumps(prediction_dict, indent=2))
    else:
        with open(
                f"{args.model_name}_{args.dataset}_{args.flag}_{args.kernel_v}_{args.kernel_r}_{args.timestamp}/outs.json",
                'w') as f:
            f.write(json.dumps(prediction_dict, indent=2))

In [4]:
p = argparse.ArgumentParser(description='Hyperparams')
p.add_argument('-t', '--task', type=str, default="train",
                help="specify the task to do: (train)ing, ft(finetune), (eval)uation")
p.add_argument('-c', '--ckpt', type=str, default="193280",
                help="Model checkpoint")
p.add_argument('-time', '--timestamp', type=str, default='2021-02-14-04-57-04',
                help="Model checkpoint")
p.add_argument('-f', '--flag', type=str, default='gpvae',
                help="Model checkpoint")
p.add_argument('-d', '--dataset', type=str, default="GYAFC/em",
                help="specify the dataset: GYAFC/em, GYAFC/fr")
p.add_argument('--model_name', type=str, default="t5-base",
                help="specify the model name: t5-base, facebook/blenderbot-400M-distill")
p.add_argument('-v', '--kernel_v', type=float, default=64.0,
                help="Hyper-parameter for prior kernel,  control the signal variance")
p.add_argument('-r', '--kernel_r', type=float, default=0.0001,
                help="Hyper-parameter for prior kernel.")
p.add_argument('-s', '--scaler', type=float, default=1.0)
p.add_argument('--from_mean', action='store_true',
                help="specify whether sample from mean during generation")
p.add_argument('-bz', '--batch_size', type=int, default=8)
p.add_argument('-e', '--epochs', type=int, default=10)
p.add_argument('--encoder_max_length', type=int, default=256)
p.add_argument('--decoder_max_length', type=int, default=48)
p.add_argument('--max_generation_length', type=int, default=96)
p.add_argument('--beam_size', type=int, default=5)
p.add_argument('--num_return_sequences', type=int, default=5)
p.add_argument('--local_rank', type=int, default=-1,
                help="Multiple GPU training")
args = p.parse_args()

# jupyter fix for bad flag
args.flag = 't5base'

### Generate predictions on validation set.

In [7]:
# Get tokenizer, model, and dev set.
#ckpt_path = f"t5-base_GYAFC/em_gpvae_64.0_0.0001_2022-07-12-02-30-44/checkpoint-10800"
#ckpt_path = f"t5-base_GYAFC/em_t5gpp128enc_64.0_0.0001_2022-07-16-14-35-44/checkpoint-4500"
ckpt_path = "t5-base_SQuAD_t5baseline256enc_64.0_0.0001_2022-07-14-16-30-47/checkpoint-2700"
#ckpt_path = "t5-base_SQuAD_t5baseline512enc_64.0_0.0001_2022-07-14-22-30-45/checkpoint-2700"
#ckpt_path = "t5-base_SQuAD_t5baseline128enc_64.0_0.0001_2022-07-14-20-27-33/checkpoint-2700"
#ckpt_path = "t5-base_SQuAD_t5baseline64enc_64.0_0.0001_2022-07-14-15-23-46/checkpoint-1200"

tokenizer = T5TokenizerFast.from_pretrained(ckpt_path)
model = T5ForConditionalGeneration.from_pretrained(ckpt_path)
val_df = parse_data('val')

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Other steps
model = model.to('cuda:0')
model.kernel_v = args.kernel_v
model.kernel_r = args.kernel_r
model.from_mean = args.from_mean
model.scaler = args.scaler

In [9]:
# Make predictions
test_output = Dataset.from_pandas(val_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}



Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

### Score predictions

In [3]:
# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
final_score

{'score': 21.832873266861107,
 'counts': [29570, 13727, 8038, 4922],
 'totals': [57892, 52607, 47322, 42037],
 'precisions': [51.077869135631865,
  26.093485657802194,
  16.98575715312117,
  11.708732783024479],
 'bp': 0.9622124576388367,
 'sys_len': 57892,
 'ref_len': 60122}

### Save or Load Predictions

In [26]:
# Save pred/target lists.
with open('reference_dict_base256.json', 'w') as fp:
    json.dump(reference_dict, fp)
with open('prediction_dict_base256.json', 'w') as fp:
    json.dump(prediction_dict, fp)

In [2]:
# Open pred/target lists.
with open('reference_dict_base256.json', 'r') as fp:
    reference_dict = json.load(fp)
with open('prediction_dict_base256.json', 'r') as fp:
    prediction_dict = json.load(fp)

### Beam experimentation

In [16]:
# Make predictions
test_output = Dataset.from_pandas(val_df[0:5]).map(
        lambda batch: {'generated': beam_generate_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}

  0%|          | 0/5 [00:00<?, ?ba/s]

In [28]:
val_df[0:5].source

9318     answer: 1421 context: Before the St. Elizabeth...
10432    answer: applied force context: Pushing against...
3106     answer: Huguon context: In this last connectio...
5685     answer: Serge Chermayeff context: One of the e...
7684     answer: the Museum of the Moving Image context...
Name: source, dtype: object

In [30]:
reference_dict['values']

[{'target': ['What year did the flood that impacted the Meuse take place?']},
 {'target': ['What makes static friction go up or down in responce to contact characteristics between an object and the surface it is on?']},
 {'target': ['By what other name was the Gate known?']},
 {'target': ['A rug by which Russian-born British designer is included in the V&A collection?']},
 {'target': ['Who put on a Doctor Who exhibition in 1991?']}]

In [31]:
prediction_dict['values'][4]['generated'].split('\t')

['Who named their exhibition "Behind the Sofa"?',
 'Who named their exhibition "Behind the Sofa" in 1991?',
 'Which museum in London named their exhibition "Behind the Sofa"?',
 "Who named their exhibition 'Behind the Sofa'?",
 'Who named the exhibition "Behind the Sofa"?']

### Generate and save predictions for multiple sets

In [18]:
# Settings
paths = ["t5-base_SQuAD_t5baseline64enc_64.0_0.0001_2022-07-14-15-23-46/checkpoint-1200",
"t5-base_SQuAD_t5baseline128enc_64.0_0.0001_2022-07-14-20-27-33/checkpoint-2700",         
"t5-base_SQuAD_t5baseline256enc_64.0_0.0001_2022-07-14-16-30-47/checkpoint-2700",
"t5-base_SQuAD_t5baseline512enc_64.0_0.0001_2022-07-14-22-30-45/checkpoint-2700"]
names = ['nucleus64', 'nucleus128', 'nucleus256', 'nucleus512']

# Get data
val_df = parse_data('val')

# Main loop
for x in range (len(names)):
    tokenizer = T5TokenizerFast.from_pretrained(paths[x])
    model = T5ForConditionalGeneration.from_pretrained(paths[x])
    model = model.to('cuda:0')
    model.kernel_v = args.kernel_v
    model.kernel_r = args.kernel_r
    model.from_mean = True
    model.scaler = args.scaler
    
    # Make predictions
    test_output = Dataset.from_pandas(val_df).map(
        lambda batch: {'generated': beam_generate_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

    # prepare evaluation data
    ref_list, pred_list = prepare_eval(list(test_output))
    reference_dict = {
        "language": "en",
        "values": ref_list,
    }
    prediction_dict = {
        "language": "en",
        "values": pred_list,
    }
    
    # Save pred/target lists.
    with open(names[x] + '_reference_dict.json', 'w') as fp:
        json.dump(reference_dict, fp)
    with open(names[x] + '_prediction_dict.json', 'w') as fp:
        json.dump(prediction_dict, fp)
        
    # Calculate BLEU-4.
    metric = datasets.load_metric('sacrebleu')
    fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
    fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
    for model_predictions, gold_references in zip(fin_preds,fin_targets):
        metric.add(predictions=model_predictions, references=gold_references)
    final_score = metric.compute()
    print(names[x], ' | ', final_score)
    print()

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


beam64  |  {'score': 20.613766611394976, 'counts': [30098, 13600, 7839, 4697], 'totals': [62003, 56718, 51433, 46148], 'precisions': [48.54281244455913, 23.97827850065235, 15.241187564404177, 10.178122562191211], 'bp': 1.0, 'sys_len': 62003, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


beam128  |  {'score': 21.832873266861107, 'counts': [29570, 13727, 8038, 4922], 'totals': [57892, 52607, 47322, 42037], 'precisions': [51.077869135631865, 26.093485657802194, 16.98575715312117, 11.708732783024479], 'bp': 0.9622124576388367, 'sys_len': 57892, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


beam256  |  {'score': 22.156978729604877, 'counts': [29891, 13929, 8212, 5059], 'totals': [59419, 54134, 48849, 43564], 'precisions': [50.3054578501826, 25.730594450807256, 16.810988965997257, 11.61279955926912], 'bp': 0.9882384813920382, 'sys_len': 59419, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


beam512  |  {'score': 22.027947806309555, 'counts': [30214, 13941, 8118, 4958], 'totals': [59726, 54441, 49156, 43871], 'precisions': [50.58768375581823, 25.60753843610514, 16.51476930588331, 11.301315219621163], 'bp': 0.9933916535261289, 'sys_len': 59726, 'ref_len': 60122}



IndexError: list index out of range

In [30]:
# Settings
tokenizer = T5TokenizerFast.from_pretrained('t5-base_SQuAD_t5baseline256enc_64.0_0.0001_2022-07-14-16-30-47/checkpoint-2700')
model = T5ForConditionalGeneration.from_pretrained('t5-base_SQuAD_t5baseline256enc_64.0_0.0001_2022-07-14-16-30-47/checkpoint-2700')
model = model.to('cuda:0')
model.kernel_v = args.kernel_v
model.kernel_r = args.kernel_r
model.from_mean = True
model.scaler = args.scaler

pk_list = [(0.5,3),(0.5,10),(0.5,100)]

# Get data
val_df = parse_data('val')

# Main loop
for x in range (len(pk_list)):
    args.top_p = pk_list[x][0]
    args.top_k = pk_list[x][1]
    
    # Make predictions
    test_output = Dataset.from_pandas(val_df).map(
        lambda batch: {'generated': nucleus_search_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

    # prepare evaluation data
    ref_list, pred_list = prepare_eval(list(test_output))
    reference_dict = {
        "language": "en",
        "values": ref_list,
    }
    prediction_dict = {
        "language": "en",
        "values": pred_list,
    }
    
        
    # Calculate BLEU-4.
    metric = datasets.load_metric('sacrebleu')
    fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
    fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
    for model_predictions, gold_references in zip(fin_preds,fin_targets):
        metric.add(predictions=model_predictions, references=gold_references)
    final_score = metric.compute()
    print('top p =', args.top_p,'top k =', args.top_k ,' | ', final_score)
    print()

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5285 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors


top p = 0.5 top k = 10  |  {'score': 19.119924463130687, 'counts': [27786, 12089, 6797, 3988], 'totals': [52696, 47411, 42126, 36841], 'precisions': [52.72885987551237, 25.498302081795362, 16.13492854769026, 10.824896175456692], 'bp': 0.8685574803018126, 'sys_len': 52696, 'ref_len': 60122}



  0%|          | 0/5285 [00:00<?, ?ba/s]

top p = 0.5 top k = 100  |  {'score': 18.705496682099312, 'counts': [27562, 11870, 6636, 3847], 'totals': [52986, 47701, 42416, 37131], 'precisions': [52.01751406031782, 24.8841743359678, 15.645039607695209, 10.360615119441976], 'bp': 0.8739980936452628, 'sys_len': 52986, 'ref_len': 60122}



### Model Structure

In [61]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

### Test on novel examples

In [46]:
# Standardize beam size and return sequences.
args.beam_size = 10
args.num_return_sequences = 10

# Example A - Superhero Question
novel_answer = "Batman"
novel_context = "The Justice League is made up of many heroes. Superman is an alien with super strength, x-ray vision, and the ability to fly. Batman uses his vast wealth to buy gadgets. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth. The Flash is just really fast."
novel_df = pd.DataFrame()
novel_df['source'] = ["answer: " + novel_answer + " context: " + novel_context]
novel_df['target'] = ['Who is the richest member of the Justice League?']
novel_df['id'] = 0


novel_example_output = Dataset.from_pandas(novel_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

novel_example_output

# Prepare evaluation data
novel_refs_list, novel_preds_list = prepare_eval(list(novel_example_output))

# Show output
novel_preds_list[0]['generated'].split('\t')

  0%|          | 0/1 [00:00<?, ?ba/s]

answer: Batman context: The Justice League is made up of many heroes. Superman is an alien with super strength, x-ray vision, and the ability to fly. Batman uses his vast wealth to buy gadgets. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth. The Flash is just really fast.
Who is the richest member of the Justice League?



['Who uses his wealth to buy gadgets?',
 'Superman is an alien with super strength, x-ray vision, and the ability to fly. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth?',
 'Superman is an alien with super strength, x-ray vision, and the ability to fly. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth.',
 'Superman is an alien with super strength, x-ray vision, and the ability to fly, and Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth?',
 'Superman is an alien with super strength, x-ray vision, and the ability to fly. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth.',
 'Superman is an alien with super strength, x-ray vision, and the ability to fly. Wonder Woman is an Amazon princess with an invisible jet, bulletproof bracelets, and a lasso of truth?',
 'Superman is an alie

In [38]:
# Standardize beam size and return sequences.
args.beam_size = 10
args.num_return_sequences = 10

# Example B - Machine Learning
novel_answer = "Stochastic gradient descent"
novel_context = "Stochastic gradient descent (often abbreviated SGD) is an iterative method for optimizing an objective function with suitable smoothness properties (e.g. differentiable or subdifferentiable). It can be regarded as a stochastic approximation of gradient descent optimization, since it replaces the actual gradient (calculated from the entire data set) by an estimate thereof (calculated from a randomly selected subset of the data). Especially in high-dimensional optimization problems this reduces the very high computational burden, achieving faster iterations in trade for a lower convergence rate."
novel_df = pd.DataFrame()
novel_df['source'] = ["answer: " + novel_answer + " context: " + novel_context]
novel_df['target'] = ['What does SGD stand for?']
novel_df['id'] = 0


novel_example_output = Dataset.from_pandas(novel_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

novel_example_output

# Prepare evaluation data
novel_refs_list, novel_preds_list = prepare_eval(list(novel_example_output))

# Show output
novel_preds_list[0]['generated'].split('\t')

  0%|          | 0/1 [00:00<?, ?ba/s]

answer: Stochastic gradient descent context: Stochastic gradient descent (often abbreviated SGD) is an iterative method for optimizing an objective function with suitable smoothness properties (e.g. differentiable or subdifferentiable). It can be regarded as a stochastic approximation of gradient descent optimization, since it replaces the actual gradient (calculated from the entire data set) by an estimate thereof (calculated from a randomly selected subset of the data). Especially in high-dimensional optimization problems this reduces the very high computational burden, achieving faster iterations in trade for a lower convergence rate.
What does SGD stand for?



['What is an iterative method for optimizing an objective function with suitable smoothness properties?',
 'What is an iterative method for optimizing an objective function?',
 'What is an iterative method for optimizing an objective function with smoothness properties?',
 'SGD is an iterative method for optimizing an objective function with suitable smoothness properties?',
 'What is the iterative method for optimizing an objective function with suitable smoothness properties?',
 'What is a iterative method for optimizing an objective function with suitable smoothness properties?',
 'What is iterative method for optimizing an objective function with suitable smoothness properties?',
 'What is an iterative method for optimizing an objective function with suitable smoothness properties called?',
 'What is an iterative method of optimizing an objective function with suitable smoothness properties?',
 'What is the iterative method for optimizing an objective function with smoothness prope

In [39]:
# Standardize beam size and return sequences.
args.beam_size = 10
args.num_return_sequences = 10

# Example C - World History
novel_answer = "1066"
novel_context = "William I, usually known as William the Conqueror and sometimes William the Bastard, was the first Norman king of England, reigning from 1066 until his death in 1087. A descendant of Rollo, he was Duke of Normandy from 1035 onward. By 1060, following a long struggle to establish his throne, his hold on Normandy was secure. In 1066, following the death of Edward the Confessor, William invaded England, leading an army of Normans to victory over the Anglo-Saxon forces of Harold Godwinson at the Battle of Hastings, and suppressed subsequent English revolts in what has become known as the Norman Conquest. The rest of his life was marked by struggles to consolidate his hold over England and his continental lands, and by difficulties with his eldest son, Robert Curthose."
novel_df = pd.DataFrame()
novel_df['source'] = ["answer: " + novel_answer + " context: " + novel_context]
novel_df['target'] = ['At which battle did William the Conqueror defeat Harold Godwinson?']
novel_df['id'] = 0


novel_example_output = Dataset.from_pandas(novel_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

novel_example_output

# Prepare evaluation data
novel_refs_list, novel_preds_list = prepare_eval(list(novel_example_output))

# Show output
novel_preds_list[0]['generated'].split('\t')

  0%|          | 0/1 [00:00<?, ?ba/s]

answer: 1066 context: William I, usually known as William the Conqueror and sometimes William the Bastard, was the first Norman king of England, reigning from 1066 until his death in 1087. A descendant of Rollo, he was Duke of Normandy from 1035 onward. By 1060, following a long struggle to establish his throne, his hold on Normandy was secure. In 1066, following the death of Edward the Confessor, William invaded England, leading an army of Normans to victory over the Anglo-Saxon forces of Harold Godwinson at the Battle of Hastings, and suppressed subsequent English revolts in what has become known as the Norman Conquest. The rest of his life was marked by struggles to consolidate his hold over England and his continental lands, and by difficulties with his eldest son, Robert Curthose.
At which battle did William the Conqueror defeat Harold Godwinson?



['When did William the Conqueror reign?',
 'When did William I reign?',
 'In what year did William the Conqueror reign?',
 'In what year did William I reign?',
 'When did William I invade England?',
 'When was William the Conqueror king of England?',
 'In what year was William I the first Norman king of England?',
 'When did William the Conqueror reign in England?',
 'When did William I reign in England?',
 'When was William the Conqueror born?']

### Final predictions on test set

In [11]:
# Make predictions (Beam)
test_df = parse_data('test')

test_output = Dataset.from_pandas(test_df).map(
    lambda batch: {'generated': beam_generate_sentences(
        batch,
        model,
        tokenizer,
        args,
        device='cuda:0')
    },
    batched=True,
    batch_size=1,
)

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}

# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
print(final_score)

# Save pred/target lists.
with open('FINAL_reference_dict_base256.json', 'w') as fp:
    json.dump(reference_dict, fp)
with open('FINAL_prediction_dict_base256.json', 'w') as fp:
    json.dump(prediction_dict, fp)

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5285 [00:00<?, ?ba/s]

{'score': 21.127372392801167, 'counts': [29520, 13511, 7729, 4540], 'totals': [58232, 52947, 47662, 42377], 'precisions': [50.69377661766726, 25.51797080098967, 16.216272921824515, 10.713358661538098], 'bp': 0.9703448812087755, 'sys_len': 58232, 'ref_len': 59985}


In [13]:
# Make predictions (Nucleus)
test_df = parse_data('test')
args.top_p = 0.5
args.top_k = 100

test_output = Dataset.from_pandas(test_df).map(
        lambda batch: {'generated': nucleus_search_sentences(
            batch,
            model,
            tokenizer,
            args,
            device='cuda:0')
        },
        batched=True,
        batch_size=1,
    )

# prepare evaluation data
ref_list, pred_list = prepare_eval(list(test_output))
reference_dict = {
    "language": "en",
    "values": ref_list,
}
prediction_dict = {
    "language": "en",
    "values": pred_list,
}

# Calculate BLEU-4.
metric = datasets.load_metric('sacrebleu')
fin_targets = [reference_dict['values'][x]['target'] for x in range(0,len(reference_dict['values']))]
fin_preds = [prediction_dict['values'][x]['generated'].split('\t')[0] for x in range(0,len(prediction_dict['values']))]
for model_predictions, gold_references in zip(fin_preds,fin_targets):
    metric.add(predictions=model_predictions, references=gold_references)
final_score = metric.compute()
print(final_score)

# Save pred/target lists.
with open('FINAL_NS_reference_dict_base256.json', 'w') as fp:
    json.dump(reference_dict, fp)
with open('FINAL_NS_prediction_dict_base256.json', 'w') as fp:
    json.dump(prediction_dict, fp)

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5285 [00:00<?, ?ba/s]

{'score': 18.25875662855481, 'counts': [27420, 11684, 6420, 3649], 'totals': [53015, 47730, 42445, 37160], 'precisions': [51.721210978025084, 24.479363084014246, 15.125456473082814, 9.819698600645856], 'bp': 0.876803628158875, 'sys_len': 53015, 'ref_len': 59985}
