In [1]:
%%capture
!pip install sentencepiece
!pip install transformers 

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Checkpoint identifier; the same id is passed to both from_pretrained calls
# so the model weights and the tokenizer come from the same checkpoint.
MODEL_NAME = 'cointegrated/rut5-base-paraphraser'
# Seq2seq T5 model used by paraphrase() for generation.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# Tokenizer matching the checkpoint (needs the sentencepiece package installed above).
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/724 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/977M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/315 [00:00<?, ?B/s]

In [3]:
import torch

# Run on the GPU when one is available, otherwise stay on the CPU: a bare
# model.cuda() raises on machines without CUDA. paraphrase() moves its
# inputs to model.device, so it works with either placement.
model.to('cuda' if torch.cuda.is_available() else 'cpu');
# Switch to evaluation mode before generating.
model.eval();

In [4]:
def paraphrase(text, beams=5, grams=4, do_sample=False):
    """Paraphrase ``text`` using the module-level ``model`` and ``tokenizer``.

    Args:
        text: A single string, or a list/tuple of strings that is tokenized
            and generated as one padded batch.
        beams: Forwarded to ``model.generate`` as ``num_beams``.
        grams: Forwarded to ``model.generate`` as
            ``encoder_no_repeat_ngram_size``.
        do_sample: Forwarded to ``model.generate`` as ``do_sample``.

    Returns:
        The decoded paraphrase as a ``str`` when ``text`` is a single string;
        a ``list`` of decoded paraphrases (one per input) when ``text`` is a
        list or tuple. Previously only the first sequence of a batch was
        decoded and the rest were silently dropped.
    """
    is_batch = isinstance(text, (list, tuple))
    x = tokenizer(
        text,
        return_tensors='pt',
        padding=True
        ).to(model.device)
    # Output budget scales with the (padded) input length: 1.5x plus 10 tokens.
    max_size = int(x.input_ids.shape[1] * 1.5 + 10)
    out = model.generate(
        **x,
        encoder_no_repeat_ngram_size=grams,
        num_beams=beams,
        max_length=max_size,
        do_sample=do_sample
        )
    if is_batch:
        return tokenizer.batch_decode(out, skip_special_tokens=True)
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [5]:
!wget https://raw.githubusercontent.com/shitkov/news_classification/main/titles.csv

--2021-08-31 14:15:18--  https://raw.githubusercontent.com/shitkov/news_classification/main/titles.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607248 (593K) [text/plain]
Saving to: ‘titles.csv’


2021-08-31 14:15:18 (22.0 MB/s) - ‘titles.csv’ saved [607248/607248]



In [6]:
import pandas as pd

In [7]:
# The wget cell saves titles.csv into the current working directory, so read
# it from there instead of the Colab-only absolute path /content/titles.csv
# (on Colab the working directory is /content, so this resolves to the same file).
# The first CSV column is used as the row index.
data = pd.read_csv('titles.csv', index_col=0)

In [8]:
# Titles of all rows labelled 1, in file order.
titles = list(data.loc[data['label'] == 1, 'title'])

In [9]:
# Titles of all rows labelled 0, in file order.
titles_neg = list(data.loc[data['label'] == 0, 'title'])

In [10]:
# The first 250 label==1 titles form the training portion (these are the ones paraphrased later).
train_titles = titles[:250]

In [11]:
# Every label==1 title from position 250 onward is held out for the test set.
test_titles = titles[250:]

In [12]:
# label==0 titles after the first 79 form the training portion.
train_titles_neg = titles_neg[79:]

In [13]:
# The first 79 label==0 titles are held out for the test set.
test_titles_neg = titles_neg[:79]

In [14]:
# Held-out evaluation frame: label==1 titles first, then label==0 titles.
# Label counts are derived from the lists themselves rather than hard-coded
# as 79/79, so the frame still builds if the split sizes or the source data
# change (a hard-coded count raises a length-mismatch error in that case).
test_headlines = pd.DataFrame({
    'title': test_titles + test_titles_neg,
    'label': [1] * len(test_titles) + [0] * len(test_titles_neg),
})

In [15]:
# Original (un-augmented) training frame: label==1 titles first, then label==0.
train_headlines = pd.DataFrame({
    'title': train_titles + train_titles_neg,
    'label': [1] * len(train_titles) + [0] * len(train_titles_neg),
})

In [16]:
from tqdm import tqdm

In [17]:
import torch

# Number of sampling passes over train_titles; each pass asks for one
# sampled paraphrase per title.
epoch = 6

# Seed torch's global RNG so the sampled paraphrases can be reproduced on a
# fresh run; without it every execution yields a different augmented set.
# NOTE(review): GPU kernels may still introduce small nondeterminism.
torch.manual_seed(42)

new_titles = []
for _ in range(epoch):
    for t in tqdm(train_titles):
        new_titles.append(paraphrase(t, do_sample=True))

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
100%|██████████| 250/250 [01:40<00:00,  2.50it/s]
100%|██████████| 250/250 [01:39<00:00,  2.51it/s]
100%|██████████| 250/250 [01:38<00:00,  2.55it/s]
100%|██████████| 250/250 [01:37<00:00,  2.56it/s]
100%|██████████| 250/250 [01:38<00:00,  2.54it/s]
100%|██████████| 250/250 [01:38<00:00,  2.53it/s]


In [18]:
# Deduplicate while keeping first-seen order. list(set(...)) removes the same
# duplicates but returns them in an order that changes between kernel runs
# (string hashing is randomized), which makes the saved CSV row order unstable.
new_titles = list(dict.fromkeys(new_titles))

In [19]:
# Wrap the generated paraphrases in a frame; every row gets label 1, the
# same label as the train_titles they were generated from.
paraphrase_headlines = pd.DataFrame({'title': new_titles}).assign(label=1)

In [20]:
# Concatenate the paraphrased titles with the original training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own 0-based index, so labels repeat and
# headlines_train.loc[i] can return two rows. The CSV is unaffected because
# it is written with index=False.
headlines_train = pd.concat(
    [paraphrase_headlines, train_headlines],
    ignore_index=True,
)

# Persist the augmented training set and the untouched test set.
headlines_train.to_csv('/content/headlines_train.csv', index=False)
test_headlines.to_csv('/content/headlines_test.csv', index=False)