In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
from tqdm import tqdm
from sage.spelling_corruption import SBSCConfig, SBSCCorruptor
from sage.utils import DatasetsAvailable, load_available_dataset_from_hf, draw_and_save_errors_distributions_comparison_charts
from sage.spelling_corruption.sbsc.labeler import process_mistypings
from sage.spelling_corruption import CharAugConfig, CharAugCorruptor

In [2]:
# Location of the original (un-augmented) training set.
TRAIN_CSV_PATH = "/work/hack/train_dataset.csv"
train = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Checkpoint that provides both the paraphrasing model and its tokenizer.
MODEL_NAME = 'cointegrated/rut5-base-paraphraser'

# Load the weights, move the model to the GPU and put it in evaluation mode.
# Binding the chained result to `model` also keeps the cell from echoing the
# module repr.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).cuda().eval()
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
def paraphrase(text, beams=5, grams=4, do_sample=False):
    """Generate paraphrases of `text` with the notebook-level `model`.

    The input is tokenized (with padding) by the notebook-level `tokenizer`,
    moved to the model's device, run through `model.generate`, and decoded
    back to strings.

    Args:
        text: Input handed directly to the tokenizer; this notebook calls the
            function with a list of strings.
        beams: Forwarded to `generate` as `num_beams`.
        grams: Forwarded to `generate` as `encoder_no_repeat_ngram_size`.
        do_sample: Forwarded to `generate` as `do_sample`.

    Returns:
        The result of `tokenizer.batch_decode` on the generated ids, with
        special tokens skipped.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
    # Output length budget: 1.5x the second dimension of `input_ids`
    # (the padded batch width), plus 10.
    input_width = encoded.input_ids.shape[1]
    generation_kwargs = dict(
        encoder_no_repeat_ngram_size=grams,
        num_beams=beams,
        max_length=int(input_width * 1.5 + 10),
        do_sample=do_sample,
    )
    generated_ids = model.generate(**encoded, **generation_kwargs)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [5]:
# Paraphrase a random 10% of the rows whose text is longer than 20 characters.
# A fixed `random_state` makes the row selection repeatable across runs (this
# cell is expensive, so re-running it should not silently change the subset).
# `.copy()` makes the subset explicitly independent of `train` before its
# 'text' column is overwritten below.
for_paraphrase = (
    train[train['text'].str.len() > 20]
    .sample(frac=0.10, random_state=0)
    .copy()
)
texts = for_paraphrase['text'].tolist()
batch_size = 10
paraphrased_data = []
# List slicing clamps at the end of the list, so the final (possibly shorter)
# batch needs no explicit upper-bound handling.
for start in tqdm(range(0, len(texts), batch_size)):
    paraphrased_data.extend(paraphrase(texts[start:start + batch_size]))
# Replace the original texts with their paraphrases, row for row.
for_paraphrase['text'] = paraphrased_data

  0%|          | 0/180 [00:00<?, ?it/s]

100%|██████████| 180/180 [20:38<00:00,  6.88s/it]


In [6]:
# Build the SBSC spelling corruptor from the library's default configuration.
# (On the recorded run this step loaded the RUSpellRU reference dataset.)
sbsc_corruptor = SBSCCorruptor.from_default_config()
# The following cell refers to this object as `corruptor`.
corruptor = sbsc_corruptor

Reusing dataset russian_spellcheck_benchmark (/root/.cache/huggingface/datasets/ai-forever___russian_spellcheck_benchmark/RUSpellRU/0.0.1/87bfa2950c7b82ec565b4da426533874af24d25436ad08dba065a45895ad3945)
100%|██████████| 2000/2000 [00:20<00:00, 98.68it/s] 


In [7]:
# Corrupt the text of a random 10% of the training rows with `corruptor`.
# `random_state` pins which rows are drawn so the subset is repeatable;
# any randomness inside `batch_corrupt` itself is not controlled here
# (TODO: confirm whether the corruptor can be seeded).
# `.copy()` makes the subset explicitly independent of `train` before its
# 'text' column is overwritten.
for_sbsc_corrupt = train.sample(frac=0.1, random_state=1).copy()
for_sbsc_corrupt['text'] = corruptor.batch_corrupt(for_sbsc_corrupt['text'].values)

100%|██████████| 1803/1803 [00:00<00:00, 4967.14it/s]


In [8]:
# Apply character-level corruption to a random 10% of the training rows.
# `random_state` pins which rows are drawn so the subset is repeatable;
# `.copy()` makes the subset explicitly independent of `train` before its
# 'text' column is overwritten.
for_char_level_corrupt = train.sample(frac=0.1, random_state=2).copy()
char_aug_config = CharAugConfig(
    unit_prob=0.1,  # proportion of characters that is going to undergo edits
    min_aug=1,  # minimum number of edits
    max_aug=5,  # maximum number of edits
    mult_num=3,  # `multiply` edit
)
# Use a dedicated name rather than rebinding the global `corruptor`: that name
# already holds the SBSC corruptor, and overwriting it would make a re-run of
# the SBSC cell silently apply character-level corruption instead.
char_corruptor = CharAugCorruptor.from_config(char_aug_config)
for_char_level_corrupt['text'] = char_corruptor.batch_corrupt(for_char_level_corrupt['text'].values)

In [9]:
# Stack the original training set with the three augmented subsets
# (paraphrased, SBSC-corrupted, character-level-corrupted), in that order.
augmented_parts = [train, for_paraphrase, for_sbsc_corrupt, for_char_level_corrupt]
new_train = pd.concat(augmented_parts)

In [10]:
# Write the augmented training set to disk; the DataFrame index is not saved.
AUGMENTED_CSV_PATH = "train_dataset_augs.csv"
new_train.to_csv(AUGMENTED_CSV_PATH, index=False)