In [None]:
#hide
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pprint import pprint
import torch
# "coderpotter/T5-for-Adversarial-Paraphrasing"
# ok, few artifacts in the data, seemed not terrible, 850MB

#"shrishail/t5_paraphrase_msrp_paws"
# 250MB, seemed a bit weaker

# "ramsrigouthamg/t5_sentence_paraphraser"
# not bad, 850MB, seemed pretty reasonable for most things. 

# "prithivida/parrot_paraphraser_on_T5"
# you know what, i think this is one of the better ones I've seen.  850MB 

# "ceshine/t5-paraphrase-quora-paws"
# honestly not bad, but it's not as good as the parrot paraphraser.

# "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"
# This model is huge - 2.75 GB. 
# But it has the best paraphrases, for sure. 

# Conclusion
# For testing, seeing what's good  - go with "prithivida/parrot_paraphraser_on_T5"
# For when you finally do it, go with "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"


# Single source of truth for the checkpoint id, so the tokenizer and the model
# cannot drift apart when switching to another paraphraser from the notes above.
PARAPHRASER_NAME = "prithivida/parrot_paraphraser_on_T5"

tokenizer = AutoTokenizer.from_pretrained(PARAPHRASER_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASER_NAME)

In [None]:
# Film-review sentences used to eyeball the quality of the paraphraser.
orig_l = [
 "this 72-minute film does have some exciting scenes , but it's a tad slow .",
 'a very average science fiction film .',
 "it doesn't matter that the film is less than 90 minutes . it still feels like a prison stretch .",
 'hardly a masterpiece , but it introduces viewers to a good charitable enterprise and some interesting real people .',
 'the good girl is a film in which the talent is undeniable but the results are underwhelming .',
 'the stories here suffer from the chosen format .',
 'lame sweet home leaves no southern stereotype unturned .',
 "funny , sexy , devastating and incurably romantic ."
]

# Run on the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Build the prompts in the `prefix + sentence + " </s>"` form and tokenize
# THOSE. Previously the raw `orig_l` sentences were tokenized, so the
# "paraphrase: " task prefix never reached the model.
# NOTE(review): recent tokenizers may append the end-of-sequence token on
# their own, which would make the explicit " </s>" redundant - confirm.
prefix = "paraphrase: "
text = [prefix + sen + " </s>" for sen in orig_l]
encoding = tokenizer(text, padding=True, return_tensors="pt")
input_ids = encoding["input_ids"].to(device)
attention_masks = encoding["attention_mask"].to(device)

In [None]:
%%time
n_eval_seq = 48  # number of sampled paraphrases requested per input sentence
model_output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_masks,
    do_sample=True,
    min_length=0,
    max_length=48,
    num_return_sequences=n_eval_seq,
    temperature=1.15,
    top_p=0.95,
    # Beam-based settings (num_beams / num_beam_groups = n_eval_seq together
    # with a diversity_penalty) were left disabled in favour of sampling.
    output_scores=True,
    return_dict_in_generate=True,
)
pp_l = tokenizer.batch_decode(model_output.sequences, skip_special_tokens=True)
# Slice the flat list into consecutive chunks of n_eval_seq: one chunk per input sentence.
pp_l_nested = [pp_l[i:i + n_eval_seq] for i in range(0, len(pp_l), n_eval_seq)]

# Use a dedicated loop name: iterating with `pp_l` overwrote the flat list
# above, leaving it bound to only the last group once the loop finished.
for orig, pp_group in zip(orig_l, pp_l_nested):
    n_unique = len(set(pp_group))
    print(orig)
    pprint(pp_group, width=200)
    print(n_unique)               # distinct paraphrases for this sentence
    print(n_unique / n_eval_seq)  # fraction of the samples that are distinct
    print()
    print()
    
# outputs = []
# for output in model_output:
#     generated_sent = tokenizer.decode(
#         output, skip_special_tokens=True, clean_up_tokenization_spaces=True
#     )
#     if (
#         generated_sent.lower() != sentence.lower()
#         and generated_sent not in outputs
#     ):
#         outputs.append(generated_sent)
# return outputs


##### 

In [None]:
def get_paraphrases(sentence, prefix="paraphrase: ", n_predictions=5, top_k=120, max_length=256):
    """Sample paraphrases of ``sentence`` with the notebook-level ``model``.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        sentence: Text to paraphrase.
        prefix: Task prefix prepended to ``sentence`` before tokenizing.
        n_predictions: Number of sequences requested from ``model.generate``;
            fewer strings may be returned after filtering.
        top_k: ``top_k`` sampling cut-off passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        Decoded strings in generation order, without exact duplicates and
        without any candidate equal to ``sentence`` ignoring case.
    """
    text = prefix + sentence + " </s>"
    # Follow the model's own device instead of guessing from CUDA availability,
    # so the call also works while the model is still on the CPU.
    device = model.device
    # One sequence needs no padding; the deprecated pad_to_max_length=True
    # padded it out to the tokenizer's maximum length for nothing.
    encoding = tokenizer(text, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    # early_stopping is a beam-search option and is dropped: this call samples.
    model_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        do_sample=True,
        max_length=max_length,
        top_k=top_k,
        top_p=0.98,
        num_return_sequences=n_predictions,
    )

    outputs = []
    for output in model_output:
        generated_sent = tokenizer.decode(
            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        is_copy_of_input = generated_sent.lower() == sentence.lower()
        if not is_copy_of_input and generated_sent not in outputs:
            outputs.append(generated_sent)
    return outputs

# Try the helper on one two-sentence review, using the default settings.
paraphrases = get_paraphrases("The actors pull out all the stops in nearly every scene , but to diminishing effect . the characters never change.")



In [None]:
# Display the filtered list returned by get_paraphrases.
paraphrases

['The actors pull out all the stops in almost every scene but with diminishing effect the characters never change.',
 'In nearly every scene the actors put all the stops but in an unrelenting manner the characters never change.',
 "The actors make all the stops in nearly every scene but to diminishing effect. the characters never change. ''",
 'The actors pull out all the stops in almost every scene but the characters never change.',
 "The actors pull out nearly everything in almost every scene. but to a diminishing effect the characters never change. ''"]

In [None]:


from travis_attack.models import get_vm_probs, _prepare_vm_tokenizer_and_model
from travis_attack.config import Config

In [None]:
# Build the project config and apply its Rotten Tomatoes dataset adjustments.
cfg = Config().adjust_config_for_rotten_tomatoes_dataset()
# Load the tokenizer/model pair selected by the config.
# NOTE(review): "vm" presumably abbreviates "victim model" - confirm against travis_attack.models.
vm_tokenizer, vm_model = _prepare_vm_tokenizer_and_model(cfg)

In [None]:
# Show which checkpoint name the config holds for the `vm` model.
cfg.vm_name

'textattack/distilbert-base-uncased-rotten-tomatoes'

In [None]:
# Two inputs to score: a sentence and a reworded variant of it.
# NOTE(review): `text` reuses the name of the prefixed paraphraser inputs built
# in an earlier cell, so re-running cells out of order can pick up the wrong list.
text = ["the film is quiet, threatening and unforgettable.",
        "The film is quiet threatening and unforgettable is successful."
       ]
# Score both inputs with the vm model; the recorded output is a 2x2 tensor
# (one row per input). NOTE(review): presumably per-class probabilities - confirm in get_vm_probs.
get_vm_probs(text, cfg, vm_tokenizer, vm_model, return_predclass=False)

tensor([[0.2675, 0.7325],
        [0.0937, 0.9063]], device='cuda:0')