### Setup 

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

In [2]:
## Imports and environment variables 
import os
os.environ["TOKENIZERS_PARALLELISM"]  = "true"  # set to false if not working

# Core imports 
import torch, numpy as np, pandas as pd, gc,sys, logging, warnings
from torch.utils.data import DataLoader, RandomSampler
from datasets import load_dataset, load_metric, load_from_disk, DatasetDict
from transformers import (AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, 
                          AutoTokenizer, AdamW, SchedulerType, get_scheduler)
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim
from collections import defaultdict
from accelerate import Accelerator, notebook_launcher
from cachetools import cached, LRUCache
from types import MethodType
from timeit import default_timer as timer
import utils; from utils import *   # local script 
from tqdm.auto import tqdm
import itertools
import copy 
import wandb
from undecorated import undecorated


# Dev imports (not needed for final script)
import seaborn as sns
from IPython.display import Markdown
from pprint import pprint
from IPython.core.debugger import set_trace
from GPUtil import showUtilization
import torchsnooper

In [4]:
# Load the paraphrase ("pp") tokenizer and seq2seq model, then attach a second
# generation entry point, `pp_model.generate_with_grad`.
#
# Candidate checkpoints for the pp_model:
# 1. tuner007/pegasus_paraphrase
# 2. tdopierre/ProtAugment-ParaphraseGenerator
# 3. eugenesiow/bart-paraphrase
pp_name = "eugenesiow/bart-paraphrase"
pp_tokenizer = AutoTokenizer.from_pretrained(pp_name)
# Author's note: the model takes about 3GB of GPU memory (it is not moved to a
# device in this cell).
# `local_files_only=True` is hard-coded here; change that argument when switching
# `pp_name` to a checkpoint that has not been downloaded yet.
# NOTE(review): the tokenizer call above does not pass `local_files_only` - confirm
# whether that asymmetry is intended.
pp_model = AutoModelForSeq2SeqLM.from_pretrained(pp_name, local_files_only=True)
# `undecorated` returns `generate` with its decorators stripped. Going by the
# attribute name, the intent is a `generate` that keeps gradients (presumably by
# dropping a `torch.no_grad` wrapper - confirm against the installed transformers).
generate_with_grad = undecorated(pp_model.generate)
# Bind the stripped function to the model instance so it can be called like a method.
pp_model.generate_with_grad = MethodType(generate_with_grad, pp_model)

In [24]:
# Two short example sentences that tokenise to different lengths, so the batch
# needs padding.
raw_text = ["Hello my name is Tom, nice to meet you!", "The cat is brown and it looks cute."]

# Encode both sentences as one padded PyTorch batch whose width is rounded up to
# a multiple of 4.
tokens = pp_tokenizer(
    raw_text,
    return_tensors='pt',
    padding=True,
    pad_to_multiple_of=4,
)

# Decoding settings, forwarded unchanged to the generate call below.
pp_model_params = dict(
    num_beams=1,
    num_return_sequences=1,
    num_beam_groups=1,
    diversity_penalty=0.,  # must be a float
    temperature=1.5,
    length_penalty=1,
    min_length=5,
)

# Run generation through the `generate_with_grad` attribute attached to the model,
# asking for a dict-style result that also carries the per-step scores.
output = pp_model.generate_with_grad(
    **tokens,
    **pp_model_params,
    do_sample=False,
    return_dict_in_generate=True,
    output_scores=True,
    remove_invalid_values=False,
)

# Turn the generated id sequences back into text, dropping special tokens.
output_text = pp_tokenizer.batch_decode(output.sequences, skip_special_tokens=True)

In [25]:
# Display the tokenizer output (input ids and attention mask) for the two sentences.
tokens

{'input_ids': tensor([[    0, 31414,   127,   766,    16,  1560,     6,  2579,     7,   972,
            47,   328,     2,     1,     1,     1],
        [    0,   133,  4758,    16,  6219,     8,    24,  1326, 11962,     4,
             2,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

Things to check 

* When does the model generate padding tokens? 
* Does p(PAD) = 1 after an EOS token?
* do rows always sum to 1 when looking at token generation scores? 
* Does first row sum to 0? (the one corresponding to start of sentence token)
* How do NaNs get introduced into token scores?
* How do infs get introduced into token scores?
* how big is the action space? 
  * initial estimate: on the order of vocab_size ^ sequence length (so bloody huge)
  * but only a very small proportion are valid actions
* How many paraphrases (pp) have probabilities over each of 1e-5, 1e-4, 1e-3, 1e-2, 1e-1? This would make a good plot: x-axis is epoch, then either (a) plot individual examples, or (b) plot averages across examples.
* log top X sampled sentences and their probs (maybe probs can be a graph)
* how does padding mask affect things?
* how does token-type-ids affect things? 
* which dist do you calculate KL divergence and entropy over? 
  * Is it the token entropies at each generation step?
  * is it the entropy of the generated paraphrase tokens? 
* given size of action space is this a good candidate for differential entropy? 
* when do you hit floating point threshold for token probabilities? When do nans and inf get introduced? 
* does using fp32 affect token calculations? 
* how does dropout affect generated probabilities? How does train/eval mode affect generated probs for a sentence? 
* how does layer-norm affect the probs? 