In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

# Loading the model

In [2]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast
model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")
tokenizer = PegasusTokenizerFast.from_pretrained("tuner007/pegasus_paraphrase")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

# Examples of initial paraphrasing

In [3]:
tokenizer.batch_decode(
    model.generate(
        tokenizer.encode("At least one of you Dunham cunts are gonna pay for my fucking boy.", return_tensors='pt'),
        temperature=1.0
    ),
    skip_special_tokens=True)

['One of you will pay for my boy.']

In [37]:
tokenizer.batch_decode(
    model.generate(
        tokenizer.encode("you can't do both of me crazy!", return_tensors='pt'),
        temperature=1.0
    ),
    skip_special_tokens=True)

["You can't do both of me crazy!"]

In [5]:
tokenizer.batch_decode(
    model.generate(
        tokenizer.encode("He is shit.", return_tensors='pt'),
        temperature=1.0
    ),
    skip_special_tokens=True)

['He is not good.']

# Dataset building

In [6]:
import pandas as pd
import os
class MyDataset(Dataset):
    def __init__(self):
        self.data = pd.read_csv(
            os.path.join(os.getcwd(), '..', 'input', 'testing-data', 'test.csv'),
            sep='\t'
        )
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return [self.data['toxic_sentence'][idx], self.data['neutral_sentence'][idx]]

In [7]:
dataset = MyDataset()

In [49]:
def collate_fn(batch):
    batch = np.array(batch)
    toxic, neutral = batch[:, 0], batch[:, 1]
    toxic = tokenizer.batch_encode_plus(toxic.tolist(), add_special_tokens=True,
                                        padding='max_length', return_tensors='pt')['input_ids']
    
    model_gen = model.generate(toxic, temperature=1.0)
    
    neutral = tokenizer.batch_encode_plus(neutral.tolist(), add_special_tokens=True,
                                          truncation=True,
                                          padding='max_length', return_tensors='pt',
                                         max_length=len(model_gen[0]))['input_ids']
    return toxic, neutral, model_gen

test_loader = DataLoader(dataset, batch_size=4, collate_fn=collate_fn)

# Training loop

In [33]:
def train_loop(model, dataloader, num_epoch=1, smaple_size=2):
    model.train()
    pbar = tqdm(enumerate(dataloader), total=smaple_size)
    for epoch in range(num_epoch):
        losses = []
        for idx, batch in pbar:
            if idx == smaple_size:
                break
            
            toxic, neutral, model_gen = batch
            loss = model(input_ids=toxic, decoder_input_ids=model_gen, labels=neutral)[0]
            losses.append(loss.item())
            loss.backward()
            
            pbar.set_postfix({
                'Epoch': f'{epoch + 1}/{num_epoch}',
                'Batch': f'{idx + 1}/{smaple_size}',
                'Loss': f"{sum(losses) / len(losses)}"
            })
    model.eval()

In [36]:
train_loop(model, test_loader, smaple_size=10)

100%|██████████| 10/10 [03:07<00:00, 18.75s/it, Epoch=1/1, Batch=10/10, Loss=9.836823177337646]


In [261]:
dataset[0]

["you can't do both of me crazy!", "You can't fool me twice!"]

In [40]:
model.save_pretrained('model.h5')

In [42]:
return_model = PegasusForConditionalGeneration.from_pretrained('model.h5')

In [51]:
dataset[10][0], tokenizer.batch_decode(
    return_model.generate(
        tokenizer.encode(dataset[10][0], return_tensors='pt'),
        temperature=1.0
    ),
    skip_special_tokens=True)

('Vega must have been killed after the judge threatened us.',
 ['The judge threatened us.'])

# Downloading the model into zip to download in localy

In [47]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [48]:
download_file('/kaggle/working/model.h5', 'model')