## Data preparation

### Download data

In [48]:
import requests
from urllib.parse import urlencode
import os

base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
public_key = 'https://disk.yandex.ru/d/OdH_N65L66Mj7g'

final_url = base_url + urlencode(dict(public_key=public_key))
response = requests.get(final_url)
download_url = response.json()['href']

download_response = requests.get(download_url)

abs_lib = os.path.abspath('../')
path = abs_lib + '/data/raw'
try:  
    os.mkdir(path)  
except OSError as error:  
    pass

with open('../data/raw/filtered.tsv', 'wb') as f:  
    f.write(download_response.content)

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('../data/raw/dataset.csv',index_col=0)

for i, row in df.iterrows():
    if row['trn_tox'] > row['ref_tox']:
        df.at[i, 'trn_tox'] = row['ref_tox']
        df.at[i, 'ref_tox'] = row['trn_tox']
        df.at[i, 'reference'] = row['translation']
        df.at[i, 'translation'] = row['reference']

df['diff_tox'] = df['ref_tox'] - df['trn_tox']
df['ref_words']= df['reference'].str.split().apply(len)
df['trn_words']= df['translation'].str.split().apply(len)

df.drop(df[((df['ref_words'] > 127) | (df['trn_words'] > 127))].index,inplace=True)
df.to_csv('../data/interim/data.csv')

### Manual seed

In [17]:
import torch
import numpy as np
import pandas as pd
torch.manual_seed(0)
np.random.seed(0)

### Read data

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/interim/data.csv',index_col=0)
df_subset, _ = train_test_split(df,train_size=0.15,shuffle=True)
train, test = train_test_split(df_subset, test_size=0.1,shuffle=True)

# Models name initialization

In [19]:
import torch
import evaluate
from transformers import pipeline, RobertaTokenizer, RobertaForSequenceClassification
from sentence_transformers import SentenceTransformer


torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

paraphrase_model_name = 'humarin/chatgpt_paraphraser_on_T5_base'
toxicity_classifier_name = 'SkolkovoInstitute/roberta_toxicity_classifier'
similarity_embedder_name = 'sentence-transformers/all-MiniLM-L6-v2'
sacrebleu_name = 'sacrebleu'

## Train paraphraser

In [20]:
import datasets

dataset_train = datasets.Dataset.from_pandas(train)
dataset_test = datasets.Dataset.from_pandas(test)

dataset = dataset_train.train_test_split(test_size=0.1)

dataset = datasets.DatasetDict({'train': dataset['train'], "test": dataset_test, "val": dataset['test']})
dataset

DatasetDict({
    train: Dataset({
        features: ['reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox', 'diff_tox', 'ref_words', 'trn_words', '__index_level_0__'],
        num_rows: 70132
    })
    test: Dataset({
        features: ['reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox', 'diff_tox', 'ref_words', 'trn_words', '__index_level_0__'],
        num_rows: 8659
    })
    val: Dataset({
        features: ['reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox', 'diff_tox', 'ref_words', 'trn_words', '__index_level_0__'],
        num_rows: 7793
    })
})

In [21]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer

para_tokenizer = AutoTokenizer.from_pretrained(paraphrase_model_name)
para_model = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model_name).to(torch_device)

In [22]:
from transformers import DataCollatorForSeq2Seq,Seq2SeqTrainingArguments, Seq2SeqTrainer

data_collator_para = DataCollatorForSeq2Seq(tokenizer=para_tokenizer, model=para_model)

In [23]:
from peft import LoraModel,LoraConfig,TaskType,get_peft_model

peft_config = LoraConfig(
        r=16, 
        lora_alpha=32,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
peft_para_model = get_peft_model(para_model,peft_config).to(torch_device)

In [24]:
def preprocess_para(examples):
    prefix = 'Paraphrase: '
    model_inputs = para_tokenizer([prefix + example for example in examples['reference']], text_target=examples['translation'], truncation=True,padding='longest', return_tensors="pt")
    return model_inputs

para_preprocessed = dataset.map(preprocess_para, batched=True)

Map:   0%|          | 0/70132 [00:00<?, ? examples/s]

Map:   0%|          | 0/8659 [00:00<?, ? examples/s]

Map:   0%|          | 0/7793 [00:00<?, ? examples/s]

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="tensorboard/paraphrase_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_steps=10,
    # predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=peft_para_model,
    args=training_args,
    train_dataset=para_preprocessed["train"],
    eval_dataset=para_preprocessed["val"],
    tokenizer=para_tokenizer,
    data_collator=data_collator_para,
    # compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/13152 [00:00<?, ?it/s]

{'loss': 16.2012, 'learning_rate': 1.9984793187347933e-05, 'epoch': 0.0}
{'loss': 16.5727, 'learning_rate': 1.9969586374695867e-05, 'epoch': 0.0}
{'loss': 16.0383, 'learning_rate': 1.99543795620438e-05, 'epoch': 0.01}
{'loss': 15.5488, 'learning_rate': 1.993917274939173e-05, 'epoch': 0.01}
{'loss': 15.6395, 'learning_rate': 1.992396593673966e-05, 'epoch': 0.01}
{'loss': 15.5681, 'learning_rate': 1.9908759124087592e-05, 'epoch': 0.01}
{'loss': 15.0755, 'learning_rate': 1.9893552311435523e-05, 'epoch': 0.02}
{'loss': 14.2868, 'learning_rate': 1.9878345498783454e-05, 'epoch': 0.02}
{'loss': 13.9076, 'learning_rate': 1.9864659367396595e-05, 'epoch': 0.02}
{'loss': 13.4564, 'learning_rate': 1.984945255474453e-05, 'epoch': 0.02}
{'loss': 13.2906, 'learning_rate': 1.983424574209246e-05, 'epoch': 0.03}
{'loss': 12.6786, 'learning_rate': 1.981903892944039e-05, 'epoch': 0.03}
{'loss': 12.5238, 'learning_rate': 1.9803832116788323e-05, 'epoch': 0.03}
{'loss': 11.9065, 'learning_rate': 1.9788625304

  0%|          | 0/488 [00:00<?, ?it/s]

{'eval_loss': 0.39518237113952637, 'eval_runtime': 41.3294, 'eval_samples_per_second': 188.558, 'eval_steps_per_second': 11.808, 'epoch': 1.0}
{'loss': 0.4515, 'learning_rate': 1.3328771289537714e-05, 'epoch': 1.0}
{'loss': 0.4518, 'learning_rate': 1.3313564476885645e-05, 'epoch': 1.0}
{'loss': 0.4151, 'learning_rate': 1.3298357664233578e-05, 'epoch': 1.01}
{'loss': 0.4643, 'learning_rate': 1.3283150851581509e-05, 'epoch': 1.01}
{'loss': 0.4429, 'learning_rate': 1.3267944038929442e-05, 'epoch': 1.01}
{'loss': 0.435, 'learning_rate': 1.3252737226277373e-05, 'epoch': 1.01}
{'loss': 0.3448, 'learning_rate': 1.3237530413625306e-05, 'epoch': 1.02}
{'loss': 0.4345, 'learning_rate': 1.3222323600973237e-05, 'epoch': 1.02}
{'loss': 0.4091, 'learning_rate': 1.3207116788321168e-05, 'epoch': 1.02}
{'loss': 0.403, 'learning_rate': 1.3191909975669101e-05, 'epoch': 1.02}
{'loss': 0.4413, 'learning_rate': 1.3176703163017032e-05, 'epoch': 1.02}
{'loss': 0.4783, 'learning_rate': 1.3161496350364963e-05, 

  0%|          | 0/488 [00:00<?, ?it/s]

{'eval_loss': 0.3773025572299957, 'eval_runtime': 45.3024, 'eval_samples_per_second': 172.022, 'eval_steps_per_second': 10.772, 'epoch': 2.0}
{'loss': 0.3989, 'learning_rate': 6.669708029197081e-06, 'epoch': 2.0}
{'loss': 0.4427, 'learning_rate': 6.654501216545013e-06, 'epoch': 2.0}
{'loss': 0.4372, 'learning_rate': 6.639294403892944e-06, 'epoch': 2.01}
{'loss': 0.387, 'learning_rate': 6.624087591240876e-06, 'epoch': 2.01}
{'loss': 0.4296, 'learning_rate': 6.608880778588809e-06, 'epoch': 2.01}
{'loss': 0.4127, 'learning_rate': 6.59367396593674e-06, 'epoch': 2.01}
{'loss': 0.4414, 'learning_rate': 6.578467153284672e-06, 'epoch': 2.01}
{'loss': 0.421, 'learning_rate': 6.563260340632603e-06, 'epoch': 2.02}
{'loss': 0.3899, 'learning_rate': 6.548053527980536e-06, 'epoch': 2.02}
{'loss': 0.3647, 'learning_rate': 6.532846715328468e-06, 'epoch': 2.02}
{'loss': 0.4181, 'learning_rate': 6.517639902676399e-06, 'epoch': 2.02}
{'loss': 0.3843, 'learning_rate': 6.502433090024331e-06, 'epoch': 2.03}

  0%|          | 0/488 [00:00<?, ?it/s]

{'eval_loss': 0.37410688400268555, 'eval_runtime': 44.907, 'eval_samples_per_second': 173.536, 'eval_steps_per_second': 10.867, 'epoch': 3.0}
{'train_runtime': 9758.7155, 'train_samples_per_second': 21.56, 'train_steps_per_second': 1.348, 'train_loss': 0.688838437634663, 'epoch': 3.0}


TrainOutput(global_step=13152, training_loss=0.688838437634663, metrics={'train_runtime': 9758.7155, 'train_samples_per_second': 21.56, 'train_steps_per_second': 1.348, 'train_loss': 0.688838437634663, 'epoch': 3.0})

In [12]:
peft_para_model.save_pretrained('../model/para_ft_2')

## Evaluation

In [25]:
class CosSimilarityPipeline():
    def __init__(self, model) -> None:
        self.model = model

    def _forward(self,inputs):
        embeds = self.model.encode(inputs)
        return embeds

    def _postprocess(self, outputs_inputs, outputs_targets):
        s = []
        for input_embd, target_embd in zip(outputs_inputs,outputs_targets):
            cos_sim = torch.nn.functional.cosine_similarity(torch.tensor(input_embd),torch.tensor(target_embd),dim=0)
            s.append(cos_sim)
        return torch.tensor(s)

    def __call__(self, inputs, targets,reduce_mean=True):
        input_embeds = self._forward(inputs)
        targets_embeds = self._forward(targets)
        sims = self._postprocess(input_embeds, targets_embeds)
        if reduce_mean:
            return torch.mean(sims).item()
        else:
            return sims.numpy()

In [29]:
toxicity_clf_tokenizer = RobertaTokenizer.from_pretrained(toxicity_classifier_name)
toxicity_clf_model = RobertaForSequenceClassification.from_pretrained(toxicity_classifier_name)
toxicity_clf_model.eval()

toxicity_clf_pipeline = pipeline('text-classification',tokenizer=toxicity_clf_tokenizer,model=toxicity_clf_model,device=torch_device)

sentence_similarity_model = SentenceTransformer(similarity_embedder_name)
sentence_similarity_model.eval()
cos_similarit_pipeline = CosSimilarityPipeline(sentence_similarity_model)

sacrebleu = evaluate.load(sacrebleu_name)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
def evaluate_metrics(toxicity_clf_pipe, similarity_clf_pipe,sacrebleu_metric, inputs, preds):
    with torch.no_grad():
        toxicities = toxicity_clf_pipe(preds)
        avg_tox = 0
        print('Toxicity computed')
        for row in toxicities:
            if row['label'] == 'neutral':
                avg_tox += (1 - row['score']) / len(toxicities)
            else:
                avg_tox += row['score'] / len(toxicities)

        cos_sim = similarity_clf_pipe(inputs,preds)
        print("Cosine similarity computed")

        bleu = sacrebleu_metric.compute(references=inputs, predictions=preds)['score']

    return {"toxicity": avg_tox,"cosine similarity": cos_sim,"bleu": bleu}

In [31]:
baseline_metrics = evaluate_metrics(toxicity_clf_pipeline,cos_similarit_pipeline, sacrebleu,test['reference'].to_list(), test['translation'].to_list())
print("Baseline metrix from dataset:\n", baseline_metrics)

Toxicity computed
Cosine similarity computed
Baseline metrix from dataset:
 {'toxicity': 0.09911557480319434, 'cosine similarity': 0.6993823647499084, 'bleu': 22.748979964592944}


In [32]:
import tqdm.notebook as tqdm

def paraphrase(
    question,
    model,
    tokenizer,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    with torch.no_grad():
        tokenized = tokenizer(
            f'Paraphrase: {question}',
            return_tensors="pt", padding="longest",
            max_length=max_length,
            truncation=True,
        )
        tokenized = tokenized.to(torch_device)
        
        outputs = model.generate(
            **tokenized, temperature=temperature, repetition_penalty=repetition_penalty,
            num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
            num_beams=num_beams, num_beam_groups=num_beam_groups,
            max_length=max_length, diversity_penalty=diversity_penalty
        )

        res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

def generate_single_paraphrases(model,tokenizer, inputs):
    preds = []
    for input in tqdm.tqdm(inputs):
        output = paraphrase(input,model, tokenizer, num_return_sequences=1)
        preds.append(output[0])
    return preds

In [33]:
general_model_preds = generate_single_paraphrases(peft_para_model,para_tokenizer, test['reference'])

  0%|          | 0/8659 [00:00<?, ?it/s]



In [39]:
general_model_metrics = evaluate_metrics(toxicity_clf_pipeline,cos_similarit_pipeline, sacrebleu, test['reference'].to_list(),general_model_preds)
print("Stock model:\n", general_model_metrics)

Toxicity computed
Cosine similarity computed
Stock model:
 {'toxicity': 0.3095971209607241, 'cosine similarity': 0.716109037399292, 'bleu': 19.189722333388502}


In [36]:
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

config = PeftConfig.from_pretrained('../model/para_ft_2')
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, '../model/para_ft_2').to(torch_device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 59332935-8948-4750-8b67-43c3a63e5ea6)')' thrown while requesting HEAD https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base/resolve/main/config.json


In [37]:
ft_model_preds = generate_single_paraphrases(model,tokenizer, test['reference'])

  0%|          | 0/8659 [00:00<?, ?it/s]



In [38]:
ft_model_metrics = evaluate_metrics(toxicity_clf_pipeline,cos_similarit_pipeline, sacrebleu,test['reference'].to_list(),ft_model_preds)
print("Fine-tuned model:\n", ft_model_metrics)

Toxicity computed
Cosine similarity computed
Fine-tuned model:
 {'toxicity': 0.5143966753414774, 'cosine similarity': 0.9037672281265259, 'bleu': 57.85035243169158}


In [77]:
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

config = PeftConfig.from_pretrained('../model/para_ft_2')
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, '../model/para_ft_2').to(torch_device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

def detoxify(text, model, tokenizer, toxicity_evaluator_pipeline, cos_sim_pipeline, num_generate_sequences=5,return_metrics=True):
    with torch.no_grad():
        paraphrased_texts = paraphrase("Paraphrase: " + text, model, tokenizer,num_return_sequences=num_generate_sequences,num_beams=5 if num_generate_sequences ==1 else num_generate_sequences)
        toxicities = toxicity_evaluator_pipeline(paraphrased_texts)
        postprocessed_detoxicities = []
        for row in toxicities:
            if row['label'] == 'neutral':
                postprocessed_detoxicities.append( row['score'])
            else:
                postprocessed_detoxicities.append(1-row['score'])
            
        similarities = cos_sim_pipeline(paraphrased_texts, [text]*num_generate_sequences,reduce_mean=False)

        if num_generate_sequences > 1:
            means = np.mean(np.stack([postprocessed_detoxicities, similarities],axis=1),axis=1)
            best_id = np.argmax(means)
            output = paraphrased_texts[best_id]
            metrics = {'toxicity': 1 - postprocessed_detoxicities[best_id], 'similarity': similarities[best_id]}
        else:
            output = paraphrased_texts[0]
            metrics = {'toxicity': 1 - postprocessed_detoxicities[0], 'similarity': similarities[0]}

    if return_metrics:
        return output, metrics
    else:
        return output
    

def evaluate_chained_model(model,tokenizer, inputs):
    preds = []
    avg_tox = 0.0
    avg_sim = 0.0
    for input in tqdm.tqdm(inputs):
        output, metrics = detoxify(input ,model, tokenizer, toxicity_clf_pipeline, cos_similarit_pipeline,num_generate_sequences=10)
        preds.append(output)
        avg_tox += metrics['toxicity'] / len(inputs)
        avg_sim += metrics['similarity'] / len(inputs)
    bleu = sacrebleu.compute(references=inputs, predictions=preds)['score']
    return {"toxicity": avg_tox,"cosine similarity": avg_sim,"bleu": bleu}
    

In [None]:
ft_chained_models_metrics = evaluate_chained_model(model, tokenizer, test['reference'])

In [80]:
print(ft_chained_models_metrics)

{'toxicity': 0.019775867462158203, 'cosine similarity': 0.5927732586860657, 'bleu': 31.947155212313625}


# Inference

In [49]:
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

config = PeftConfig.from_pretrained('../model/para_ft_2')
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, '../model/para_ft_2').to(torch_device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [50]:
def detoxify(text, model, tokenizer, toxicity_evaluator_pipeline, cos_sim_pipeline, num_generate_sequences=5,return_metrics=True):
    with torch.no_grad():
        paraphrased_texts = paraphrase("Paraphrase: " + text, model, tokenizer,num_return_sequences=num_generate_sequences,num_beams=5 if num_generate_sequences ==1 else num_generate_sequences)
        toxicities = toxicity_evaluator_pipeline(paraphrased_texts)
        postprocessed_detoxicities = []
        for row in toxicities:
            if row['label'] == 'neutral':
                postprocessed_detoxicities.append( row['score'])
            else:
                postprocessed_detoxicities.append(1-row['score'])
            
        similarities = cos_sim_pipeline(paraphrased_texts, [text]*num_generate_sequences,reduce_mean=False)

        if num_generate_sequences > 1:
            means = np.mean(np.stack([postprocessed_detoxicities, similarities],axis=1),axis=1)
            best_id = np.argmax(means)
            output = paraphrased_texts[best_id]
            metrics = {'toxicity': 1 - postprocessed_detoxicities[best_id], 'similarity': similarities[best_id]}
        else:
            output = paraphrased_texts[0]
            metrics = {'toxicity': 1 - postprocessed_detoxicities[0], 'similarity': similarities[0]}

    if return_metrics:
        return output, metrics
    else:
        return output
    

In [52]:
detoxify(test['reference'].iloc[5], model, tokenizer, toxicity_clf_pipeline, cos_similarit_pipeline,num_generate_sequences=10)

("you're a clever little guy.",
 {'toxicity': 0.0006096363067626953, 'similarity': 0.83392817})