In [1]:
import pandas as pd
from parrot import Parrot
import torch
import warnings

In [3]:
# Load the ROCStories CSV into a DataFrame, report how many rows it has,
# and preview the first rows as the cell output.
# NOTE(review): the path below is machine-specific; consider a configurable data directory.
rocstories_csv = "/home/robin/Code_repo/psycholinguistic2125/paraphrase_detection/data/ROCStories/ROCStories_winter2017.csv"
data = pd.read_csv(rocstories_csv)
print("length of data: ", data.shape[0])
data.head()

length of data:  52665


Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.


In [None]:
# Build the Parrot paraphraser from the named model tag.
parrot_model_tag = "prithivida/parrot_paraphraser_on_T5"
parrot = Parrot(model_tag=parrot_model_tag)

Adequacy (Is the meaning preserved adequately?)

Fluency (Is the paraphrase fluent English?)

Diversity (Lexical / Phrasal / Syntactical) (How much has the paraphrase changed the original sentence?)



In [17]:
# Paraphrase the first two "sentence2" entries with Parrot and print each
# returned candidate exactly as Parrot hands it back.
phrases = data["sentence2"].head(2).tolist()

# Only ask Parrot for GPU inference when CUDA is actually available, so the
# cell also runs on CPU-only machines.
use_gpu = torch.cuda.is_available()
separator = "-" * 100

for phrase in phrases:
    print(separator)
    print("Input_phrase: ", phrase)
    print(separator)
    para_phrases = parrot.augment(
        input_phrase=phrase,
        use_gpu=use_gpu,
        diversity_ranker="levenshtein",  # levenshtein / diff
        do_diverse=9,  # NOTE(review): looks like a flag; confirm whether a bool was intended
        max_return_phrases=10,
        adequacy_threshold=0.80,
        fluency_threshold=0.90,
    )

    # augment() may return None when no candidate clears the adequacy/fluency
    # thresholds; treat that as "no paraphrases" instead of crashing the loop.
    for para_phrase in para_phrases or []:
        print(para_phrase)

----------------------------------------------------------------------------------------------------
Input_phrase:  He examined his habits to try and figure out the reason.
----------------------------------------------------------------------------------------------------
('he took a look at his habits to try to figure out the cause', 30)
('he took a look at his habits to try to figure out the reason', 25)
('he took a look at his habits to try and figure out the reason', 22)
('he examined his habits to try to figure out the reason', 16)
('he examined his habits to try and figure out the reason', 13)
----------------------------------------------------------------------------------------------------
Input_phrase:  One day a guest made him very angry.
----------------------------------------------------------------------------------------------------
('a guest made him angry one day', 29)
('when he was a guest one day he was angry', 25)
('when he was a guest he became very angry', 22)
(

## Quality Controlled Paraphrase Generation (ACL 2022)

source: https://github.com/IBM/quality-controlled-paraphrase-generation

```
@inproceedings{bandel-etal-2022-quality,
    title = "Quality Controlled Paraphrase Generation",
    author = "Bandel, Elron  and
      Aharonov, Ranit  and
      Shmueli-Scheuer, Michal  and
      Shnayderman, Ilya  and
      Slonim, Noam  and
      Ein-Dor, Liat",
    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = may,
    year = "2022",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.acl-long.45",
}
```

In [18]:
## 

from transformers import pipeline

class QualityControlPipeline:
    """Wrap an ``ibm/qcpg-<type>`` text2text-generation pipeline.

    Calling the instance prepends three ``COND_*`` control tokens (semantic
    similarity, lexical diversity, syntactic diversity) to the input text(s)
    and forwards the result to the underlying pipeline.
    """

    def __init__(self, type):
        """Load the pipeline for one of the supported model variants.

        Args:
            type: One of ``'captions'``, ``'questions'`` or ``'sentences'``;
                selects both the model name and the per-control value ranges.

        Raises:
            AssertionError: If ``type`` is not one of the supported variants.
        """
        assert type in ['captions', 'questions', 'sentences']
        self.pipe = pipeline('text2text-generation', model=f'ibm/qcpg-{type}')
        # Allowed [min, max] for each control, keyed by the first three
        # letters of the control name ('lex', 'syn', 'sem').
        self.ranges = {
            'captions': {'lex': [0, 90], 'syn': [0, 80], 'sem': [0, 95]},
            'sentences': {'lex': [0, 100], 'syn': [0, 80], 'sem': [0, 95]},
            'questions': {'lex': [0, 90], 'syn': [0, 75], 'sem': [0, 95]}
        }[type]

    def __call__(self, text, lexical, syntactic, semantic, **kwargs):
        """Run the pipeline on ``text`` with the requested control values.

        Args:
            text: A single string, or an iterable of strings.
            lexical: Lexical diversity control in ``[0, 1]``.
            syntactic: Syntactic diversity control in ``[0, 1]``.
            semantic: Semantic similarity control in ``[0, 1]``.
            **kwargs: Forwarded unchanged to the underlying pipeline call.

        Returns:
            Whatever the underlying pipeline returns for the prefixed input.

        Raises:
            AssertionError: If a control value lies outside ``[0, 1]`` or a
                generated control token is not among the tokenizer's
                additional special tokens.
        """
        assert all(0 <= val <= 1 for val in (lexical, syntactic, semantic)), \
            f' control values must be between 0 and 1, got {lexical}, {syntactic}, {semantic}'
        names = ['semantic_sim', 'lexical_div', 'syntactic_div']
        # Scale to 0-100 and snap to the nearest multiple of 5.
        control = [int(5 * round(val * 100 / 5)) for val in [semantic, lexical, syntactic]]
        # Clamp every value into the range configured for this model variant.
        control = {
            name: max(min(val, self.ranges[name[:3]][1]), self.ranges[name[:3]][0])
            for name, val in zip(names, control)
        }
        control = [f'COND_{name.upper()}_{control[name]}' for name in names]
        assert all(cond in self.pipe.tokenizer.additional_special_tokens for cond in control), \
            f'unknown control tokens for this model: {control}'
        prefix = ' '.join(control)
        if isinstance(text, str):
            text = prefix + text
        else:
            # Prefix every element; previously the list branch dropped the
            # input texts and sent only the control tokens to the model.
            text = [prefix + t for t in text]
        return self.pipe(text, **kwargs)

In [19]:
# Build the quality-controlled paraphraser for the 'sentences' variant.
model = QualityControlPipeline(type='sentences')

Downloading (…)lve/main/config.json: 100%|██████████| 1.50k/1.50k [00:00<00:00, 2.11MB/s]
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [01:18<00:00, 11.4MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 1.69k/1.69k [00:00<00:00, 2.95MB/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.70MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.40M/1.40M [00:00<00:00, 7.30MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 2.56MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.48k/1.48k [00:00<00:00, 3.21MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
# Run the quality-controlled paraphraser on every input phrase and print
# each returned item under a banner showing the input.
divider = "-" * 100
controls = {"lexical": 0.3, "syntactic": 0.5, "semantic": 0.8}

for phrase in phrases:
    print(divider)
    print("Input_phrase: ", phrase)
    print(divider)
    para_phrases = model(phrase, **controls)

    for para_phrase in para_phrases:
        print(para_phrase)



----------------------------------------------------------------------------------------------------
Input_phrase:  He examined his habits to try and figure out the reason.
----------------------------------------------------------------------------------------------------
{'generated_text': "He's examined his behavior and he's looking for a reason."}
----------------------------------------------------------------------------------------------------
Input_phrase:  One day a guest made him very angry.
----------------------------------------------------------------------------------------------------
{'generated_text': 'A guest of his own made him very angry one day.'}


## ChatGPT paraphraser

source: https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base

```
@inproceedings{chatgpt_paraphraser,
  author={Vladimir Vorobev, Maxim Kuznetsov},
  title={A paraphrasing model based on ChatGPT paraphrases},
  year={2023}
}
```

In [27]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Prefer the GPU, but fall back to CPU so the cell still runs on machines
# without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and model are loaded from the same checkpoint name.
checkpoint = "humarin/chatgpt_paraphraser_on_T5_base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): this rebinds `model`, which earlier held the QualityControlPipeline
# instance; re-running the earlier demo cell after this one will use the wrong object.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level model.

    The text is prefixed with ``'paraphrase: '``, tokenized with the
    module-level ``tokenizer`` (truncated to ``max_length``), moved to
    ``device`` and passed to ``model.generate``.

    Args:
        question: Text to paraphrase.
        num_beams, num_beam_groups, num_return_sequences, repetition_penalty,
        diversity_penalty, no_repeat_ngram_size, temperature: Forwarded
            unchanged to ``model.generate``.
        max_length: Used both as the tokenizer truncation length and as the
            ``max_length`` argument of ``model.generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences,
        with special tokens skipped.
    """
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    input_ids = encoded.input_ids.to(device)

    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(input_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


In [28]:
# Paraphrase every input phrase with paraphrase() and print each candidate
# under a banner showing the input.
banner = "-" * 100

for phrase in phrases:
    print(banner)
    print("Input_phrase: ", phrase)
    print(banner)
    para_phrases = paraphrase(phrase)

    for para_phrase in para_phrases:
        print(para_phrase)


----------------------------------------------------------------------------------------------------
Input_phrase:  He examined his habits to try and figure out the reason.
----------------------------------------------------------------------------------------------------




He re-examined his habits to ascertain the cause.
He scrutinized his behavior to ascertain the cause.
In an effort to pinpoint the cause, he scrutinized his behavior.
To determine the reason, he examined his habits.
His behavior was scrutinized to determine the cause.
----------------------------------------------------------------------------------------------------
Input_phrase:  One day a guest made him very angry.
----------------------------------------------------------------------------------------------------
He was extremely angry with a guest on one occasion.
A guest caused him to become extremely angry.
An unexpected guest caused him to become extremely angry.
One day, he was so upset by a guest that.
On a particular day, he was infuriated by.


## From Databases to QA Semantic Parsers with Only Synthetic Training Data

source: https://www.aclweb.org/anthology/2020.emnlp-main.31

https://huggingface.co/stanford-oval/paraphraser-bart-large?text=He+examined+his+habits+to+try+and+figure+out+the+reason
