<a href="https://colab.research.google.com/github/shitkov/courses/blob/master/transformers/transformers_shitkov_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Information about the submission

## 1.1 Name and number of the assignment 

Detoxification - #2

## 1.2 Student name

Konstantin Shitkov

## 1.3 Codalab user ID

shitkov

## 1.4 Additional comments

-

# 2. Technical Report

## 2.1 Methodology 

Baseline: T5

Experiments:
1.   ruT5-base
1.   ruT5-large + extended data

## 2.2 Discussion of results

Baseline: T5

Experiments:
1.   ruT5-base: 0.47
1.   ruT5-large + extended data: 0.53

Увеличить модель и добавить данных - всегда работает. Данные расширил путем обратной токсификации с использованием модели IlyaGusev/rut5_tox, а также перевода датасета parallel_detoxification_dataset_small.tsv на русский с фильтрацией: классифицировал с помощью целевого берта и выбирал пары токс/детокс.

# 3. Preparation

In [None]:
# Mount Google Drive at /drive (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/drive')

## 3.1 Download the data

In [None]:
%%capture
# Train / dev / test splits from the s-nlp/russe_detox_2022 repository.
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/train.tsv
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/dev.tsv
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/test.tsv
# Extra parallel corpus; it is machine-translated to Russian in section 3.2.
!wget https://raw.githubusercontent.com/s-nlp/parallel_detoxification_dataset/main/parallel_detoxification_dataset_small.tsv

# Evaluation helpers; they are imported as plain Python modules in the imports cell.
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/evaluation/ru_detoxification_evaluation.py
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/evaluation/ru_detoxification_metrics.py

## 3.1 Requirements

In [None]:
%%capture
# Runtime dependencies. Only googletrans is pinned; the other packages install
# whatever version is current at run time.
!pip install pytorch-crf
!pip install transformers sentencepiece
!pip install googletrans==3.1.0a0

In [None]:
import gc

import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from googletrans import Translator

from typing import Tuple, List, Dict, Union

from tqdm.auto import tqdm, trange

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel

from ru_detoxification_evaluation import load_model
from ru_detoxification_metrics import evaluate_style
from ru_detoxification_metrics import evaluate_cosine_similarity
from ru_detoxification_metrics import evaluate_cola_relative

In [None]:
def cleanup():
    """Release memory between the heavy stages of the notebook.

    Runs Python's garbage collector first, then calls
    ``torch.cuda.empty_cache()``. Returns nothing.
    """
    # Collect first so that tensors owned by dead Python objects are dropped
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Use the GPU when one is available; on a CPU-only runtime the flag is False.
use_cuda = torch.cuda.is_available()

## 3.2 Translate

In [None]:
# Load the parallel corpus and pull its two text columns out as plain lists.
data = pd.read_csv('parallel_detoxification_dataset_small.tsv', sep='\t')

toxic_comments, civil_comments = (
    data[column].tolist() for column in ('toxic_comment', 'civil_comment')
)

In [None]:
# googletrans client; the translation loop in the next cell reuses this one instance.
translator = Translator()

In [None]:
# Translate every toxic/civil pair to Russian. Both lists are appended to in the
# same iteration, so position i in one always matches position i in the other.
toxic_comments_ru = []
civil_comments_ru = []
# zip() has no length, so the total is passed explicitly; without it tqdm can only
# show a counter, not progress or an ETA.
pair_count = min(len(toxic_comments), len(civil_comments))
for tox, detox in tqdm(zip(toxic_comments, civil_comments), total=pair_count):
    toxic_comments_ru.append(translator.translate(tox, dest='ru').text)
    civil_comments_ru.append(translator.translate(detox, dest='ru').text)

## 3.3 Filtering translated data

In [None]:
# Toxicity classifier used to filter the translated pairs and, later, the retoxified ones.
style_model, style_tokenizer = load_model('SkolkovoInstitute/russian_toxicity_classifier', use_cuda=use_cuda)

In [None]:
# Score the translated toxic side against the toxic class; the scores are
# thresholded at 0.5 when the translated pairs are filtered.
toxic_comments_ru_labels = evaluate_style(
    model = style_model,
    tokenizer = style_tokenizer,
    texts = toxic_comments_ru,
    target_label=1,  # 1 is toxic, 0 is neutral
    batch_size=32, 
    verbose=False
)

In [None]:
# Score the translated civil side against the neutral class (target_label=0).
# NOTE(review): batch_size is 1 here but 32 for the toxic side — presumably a
# leftover memory workaround; confirm whether 32 can be used here too.
civil_comments_ru_labels = evaluate_style(
    model = style_model,
    tokenizer = style_tokenizer,
    texts = civil_comments_ru,
    target_label=0,  # 1 is toxic, 0 is neutral
    batch_size=1, 
    verbose=False
)

In [None]:
# Put the translated texts and their classifier scores side by side.
data = pd.DataFrame({
    'toxic_comment': toxic_comments_ru,
    'civil_comment': civil_comments_ru,
    'toxic_label': toxic_comments_ru_labels,
    'civil_label': civil_comments_ru_labels,
})

# Keep a pair only when both sides score above 0.5 for their own target class,
# then drop the score columns so the frame holds just the text pair.
both_sides_confident = (data['toxic_label'] > .5) & (data['civil_label'] > .5)
df_translated = data[both_sides_confident].drop(columns=['toxic_label', 'civil_label'])

## 3.4 Retoxify civil comments

In [None]:
# Training split, indexed by its 'index' column; missing cells become empty strings.
df = pd.read_csv('train.tsv', sep='\t', index_col='index').fillna('')

In [None]:
# Flatten the training split into (toxic, neutral) pairs, one per non-empty reference.
df_train_toxic, df_train_neutral = [], []

for index, row in df.iterrows():
    toxic = row['toxic_comment']
    for reference in row[['neutral_comment1', 'neutral_comment2', 'neutral_comment3']].tolist():
        if len(reference) == 0:
            # References are read left to right; the first empty slot ends the row.
            break
        df_train_toxic.append(toxic)
        df_train_neutral.append(reference)

In [None]:
# Training pairs as a two-column frame (same column names as the other pair frames).
df_train = pd.DataFrame({
    'toxic_comment': df_train_toxic,
    'civil_comment': df_train_neutral,
})

In [None]:
model_name = 'IlyaGusev/rut5_tox'

# Model used to turn neutral references back into toxic text ("retoxification").
# Device placement follows the notebook-wide `use_cuda` flag instead of an
# unconditional .cuda() call.
model_tox = T5ForConditionalGeneration.from_pretrained(model_name)
if use_cuda:
    model_tox = model_tox.cuda()
tokenizer_tox = AutoTokenizer.from_pretrained(model_name)

In [None]:
def paraphrase_tox(text, model, tokenizer, n=None, max_length='auto', temperature=0.0, beams=3):
    """Generate rewrites of ``text`` with ``model`` using deterministic beam search.

    Args:
        text: a single string or a list of strings.
        model: object exposing ``.device`` and ``.generate(...)``.
        tokenizer: callable tokenizer that also provides ``.decode(...)``.
        n: number of sequences to return per input; ``None`` means one.
        max_length: generation length limit, or ``'auto'`` to derive it from the
            padded input width (``int(width * 1.2) + 10``).
        temperature: forwarded to ``generate`` unchanged.
        beams: number of beams.

    Returns:
        A single string when ``text`` is a string and ``n`` is falsy; otherwise
        the list of all decoded sequences.
    """
    single_input = isinstance(text, str)
    batch = [text] if single_input else text

    input_ids = tokenizer(batch, return_tensors='pt', padding=True)['input_ids'].to(model.device)

    if max_length == 'auto':
        max_length = int(input_ids.shape[1] * 1.2) + 10

    generation_kwargs = dict(
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=3.0,
        max_length=max_length,
        bad_words_ids=[[2]],  # unk
        num_beams=beams,
    )
    generated = model.generate(input_ids, **generation_kwargs)

    decoded = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]
    if single_input and not n:
        return decoded[0]
    return decoded

In [None]:
# Retoxify every neutral training reference, one text per generate() call.
retox = []
for text in tqdm(df_train_neutral):
    retox.append(paraphrase_tox(text, model_tox, tokenizer_tox))

In [None]:
# Pair each generated toxic text with the neutral reference it was produced from.
df_retox = pd.DataFrame({
    'toxic_comment': retox,
    'civil_comment': df_train_neutral,
})

In [None]:
# The retoxification model is not needed past this point; drop it and free memory.
del model_tox
cleanup()

In [None]:
# Score the generated texts against the toxic class, so that generations which
# did not come out toxic can be filtered away.
retox_labels = evaluate_style(
    model = style_model,
    tokenizer = style_tokenizer,
    texts = retox,
    target_label=1,  # 1 is toxic, 0 is neutral
    batch_size=32, 
    verbose=False
)

In [None]:
# Attach the toxicity score of each generated text to its row.
df_retox['retox_labels'] = retox_labels

In [None]:
# Keep only generations scored above 0.5 for the toxic class.
# Note: the 'retox_labels' column stays on the frame after this filter.
df_retox = df_retox[df_retox['retox_labels'] > .5]

In [None]:
# Filtering is finished; drop the classifier before the training model is loaded.
del style_model
cleanup()

In [None]:
# Sanity check: CUDA memory still allocated by tensors, in whole GiB (2 ** 30 bytes).
print(torch.cuda.memory_allocated() // 2 ** 30)

## 3.5 Add dev data

In [None]:
# Development split; its pairs are folded into the training data below.
dev = pd.read_csv('dev.tsv', sep='\t')

In [None]:
# Missing references become '' so the len() check in the pairing loop works on strings.
dev = dev.fillna('')

In [None]:
df_train_toxic = []
df_train_neutral = []

for index, row in dev.iterrows():
    references = row[['neutral_comment1', 'neutral_comment2', 'neutral_comment3']].tolist()
    
    for reference in references:
        if len(reference) > 0:
            df_train_toxic.append(row['toxic_comment'])
            df_train_neutral.append(reference)
        else:
            break

In [None]:
df_dev = pd.DataFrame()
df_dev['toxic_comment'] = df_train_toxic
df_dev['civil_comment'] = df_train_neutral

In [None]:
# Merge all pair sources into one training frame. Only the two text columns are
# taken from each source, so helper columns (df_retox still carries
# 'retox_labels') do not leak into the result as a mostly-NaN column.
PAIR_COLUMNS = ['toxic_comment', 'civil_comment']
df = pd.concat(
    [frame[PAIR_COLUMNS] for frame in (df_train, df_dev, df_translated, df_retox)],
    ignore_index=True,
)
# Seeded shuffle, so a re-run produces the same row order.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

In [None]:
# Cache the combined training pairs. The index is a plain 0..N-1 counter, so it is
# not written; otherwise it comes back as an extra 'Unnamed: 0' column on reload.
df.to_csv('df.csv', index=False)

In [None]:
# Reload the cached pairs. keep_default_na=False keeps every cell as text: with the
# default NA parsing, an empty comment or one that is literally "NA"/"null"/"nan"
# would be read back as float NaN and break tokenization later.
df = pd.read_csv('df.csv', keep_default_na=False)

# 4. Train

In [None]:
class PairsDataset(torch.utils.data.Dataset):
    """Dataset of tokenized (source, target) pairs for seq2seq training.

    ``x`` and ``y`` are mappings of per-example sequences (tokenizer output);
    both are indexed with the same example position.
    """

    def __init__(self, x, y):
        self.x = x  # encoder side; every key is copied into each item
        self.y = y  # decoder side; supplies 'labels' and 'decoder_attention_mask'

    @property
    def n(self):
        """Number of examples, measured on the encoder ``input_ids``."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n # * 2

    def __getitem__(self, idx):
        """Return one example as a dict of unpadded sequences."""
        assert idx < len(self.x['input_ids'])
        item = {}
        for key, values in self.x.items():
            item[key] = values[idx]
        item['decoder_attention_mask'] = self.y['attention_mask'][idx]
        item['labels'] = self.y['input_ids'][idx]
        return item

In [None]:
class DataCollatorWithPadding:
    """Collate ``PairsDataset`` items into one padded batch of tensors.

    The encoder fields and the decoder fields are padded in two separate
    ``tokenizer.pad`` calls, because they generally have different lengths.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # First pass: pad the encoder side of the batch.
        encoder_batch = self.tokenizer.pad(features, padding=True)

        # Second pass: present the decoder fields under the names the tokenizer
        # pads ('input_ids' / 'attention_mask'), then map them back.
        decoder_fields = {
            'input_ids': encoder_batch['labels'],
            'attention_mask': encoder_batch['decoder_attention_mask'],
        }
        decoder_batch = self.tokenizer.pad(decoder_fields, padding=True)
        encoder_batch['labels'] = decoder_batch['input_ids']
        encoder_batch['decoder_attention_mask'] = decoder_batch['attention_mask']

        tensors = {}
        for name, values in encoder_batch.items():
            tensors[name] = torch.tensor(values)
        return tensors

In [None]:
def train_loop(
    model, train_dataloader,
    max_epochs=10,
    lr=3e-5,
    gradient_accumulation_steps=1, 
    cleanup_step=100
):
    """Fine-tune ``model`` on ``train_dataloader`` with Adam.

    Args:
        model: seq2seq model whose forward pass returns an object with ``.loss``
            and which provides ``.device`` and ``.save_pretrained``.
        train_dataloader: iterable of batches (dicts of tensors with a ``'labels'`` key).
        max_epochs: number of passes over ``train_dataloader``.
        lr: Adam learning rate.
        gradient_accumulation_steps: number of successfully processed batches
            whose gradients are summed before one optimizer step. The loss is not
            divided by this number.
        cleanup_step: ``cleanup()`` is called on every batch index divisible by it.

    Side effects:
        Label ids equal to 0 are overwritten with -100 in place in each batch, and
        the model is saved to ``'t5_detox'`` after every epoch.
    """
    cleanup()
    optimizer = torch.optim.Adam(params = [p for p in model.parameters() if p.requires_grad], lr=lr)

    model.train()

    for epoch in trange(max_epochs):
        # Start every epoch from clean gradients.
        optimizer.zero_grad()
        accumulated = 0  # batches whose gradients are waiting for an optimizer step

        for i, batch in enumerate(tqdm(train_dataloader)):
            try:
                # Label id 0 is replaced by -100 so those positions are excluded from the loss.
                batch['labels'][batch['labels']==0] = -100
                loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
                loss.backward()
            except Exception as e:
                print('error on step', i, e)
                # A backward pass that died midway leaves gradients on only some
                # parameters; discard the whole pending accumulation rather than
                # stepping on inconsistent gradients.
                optimizer.zero_grad()
                accumulated = 0
                cleanup()
                continue

            # Count processed batches instead of testing `i`: the previous
            # `if i and i % steps == 0` never stepped on batch 0 and fired one
            # batch late for every accumulation window.
            accumulated += 1
            if accumulated >= gradient_accumulation_steps:
                optimizer.step()
                optimizer.zero_grad()
                accumulated = 0

            if i % cleanup_step == 0:
                cleanup()

        # Apply gradients left over from an incomplete accumulation window so the
        # last batches of the epoch are not lost.
        if accumulated:
            optimizer.step()
            optimizer.zero_grad()

        model.save_pretrained('t5_detox')
    cleanup()

In [None]:
# Checkpoint to fine-tune and the DataLoader batch size.
# NOTE(review): the report attributes the best score to ruT5-large, while this
# cell selects ruT5-base — confirm which checkpoint the final run used.
model_name = 'sberbank-ai/ruT5-base'

batch_size = 2

In [None]:
model = T5ForConditionalGeneration.from_pretrained(model_name)
# train_loop and paraphrase send their tensors to model.device, so the model
# itself has to be moved to the GPU here; otherwise fine-tuning and generation
# silently run on the CPU.
if use_cuda:
    model = model.cuda()

In [None]:
# Tokenizer of the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize both sides once up front; padding is applied later, per batch, by the collator.
encoded_toxic = tokenizer(df['toxic_comment'].tolist())
encoded_civil = tokenizer(df['civil_comment'].tolist())
train_dataset = PairsDataset(encoded_toxic, encoded_civil)

In [None]:
# Collator that pads encoder and decoder fields separately and returns tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Shuffled mini-batches, padded on the fly by data_collator; the last partial batch is kept.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, drop_last=False, shuffle=True, collate_fn=data_collator)

In [None]:
# Fine-tune the model in place; train_loop saves it to 't5_detox' after every epoch.
train_loop(
    model=model,
    train_dataloader=train_dataloader,
    max_epochs=10, 
    lr=3e-5,
    gradient_accumulation_steps=1, 
    cleanup_step=100
)

# 5. Test

In [None]:
# Test inputs to detoxify. Both `data` and `toxic_comments` are rebound here;
# earlier in the notebook they held the parallel corpus and its toxic column.
data = pd.read_csv('test.tsv', sep='\t')
toxic_comments = data['toxic_comment'].tolist()

In [None]:
def paraphrase(text, model, tokenizer):
    """Generate three candidate rewrites of ``text``.

    Sampling is enabled together with 5 beams, so results vary between calls.
    The length limit is derived from the tokenized input width
    (``int(width * 1.2) + 10``).

    Args:
        text: a single input string.
        model: object exposing ``.device`` and ``.generate(...)``.
        tokenizer: callable tokenizer that also provides ``.decode(...)``.

    Returns:
        List of decoded strings, one per returned sequence.
    """
    input_ids = tokenizer([text], return_tensors='pt', padding=True)['input_ids'].to(model.device)

    generation_kwargs = dict(
        num_return_sequences=3,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=3.0,
        max_length=int(input_ids.shape[1] * 1.2) + 10,
        bad_words_ids=[[2]],  # unk
        num_beams=5,
    )
    generated = model.generate(input_ids, **generation_kwargs)

    return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]

In [None]:
# train_loop leaves the model in train() mode and generate() does not change it,
# so switch to eval mode here; otherwise dropout stays active while generating.
model.eval()

para_results = []

for text in tqdm(toxic_comments):
    try:
        para_results.append(paraphrase(text, model, tokenizer))
    except Exception as e:
        # Keep exactly one entry per input: later cells map candidates back to
        # their source text by position. Stopping at the first failure would leave
        # para_results shorter than toxic_comments and hide the actual error.
        print('ERROR', repr(e))
        # Fall back to the unchanged input, repeated once per expected candidate
        # (paraphrase() returns 3 sequences).
        para_results.append([text] * 3)

In [None]:
# Generation is finished; release the fine-tuned model before the scoring models are loaded.
del model
cleanup()

# 6. Filtering

In [None]:
n = 3 # num_return_sequences

# Flatten the per-input candidate lists; id_list[k] is the position of the test
# input that para_x3[k] was generated from.
id_list = []
para_x3 = []
for i, batch in enumerate(para_results):
    # Later cells recover the source with integer division by the candidate
    # count, which only holds if every input has exactly n candidates.
    assert len(batch) == n, f'input {i} has {len(batch)} candidates, expected {n}'
    para_x3.extend(batch)
    id_list.extend([i] * n)

In [None]:
# One row per candidate: the source input position and the generated text.
# `df` is rebound here; it previously held the training pairs.
df = pd.DataFrame({'id': id_list, 'text': para_x3})

## 6.1 CLS score

In [None]:
# Toxicity classifier used to score each generated candidate.
tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
# get_cls_score sends its inputs to model.device, so placing the model on the GPU
# is all that is needed to score there instead of on the CPU.
if use_cuda:
    model = model.cuda()
model.eval()

In [None]:
def get_cls_score(model, tokenizer, text):
    """Return the classifier's softmax probability of class index 0 for ``text``.

    Index 0 is the neutral class according to the label comment used with this
    classifier elsewhere in the notebook ("1 is toxic, 0 is neutral").

    Args:
        model: sequence classifier exposing ``.device`` and returning ``.logits``.
        tokenizer: tokenizer providing ``encode(text, return_tensors='pt')``.
        text: a single string.

    Returns:
        The probability for the first row of the batch, as a NumPy scalar.
    """
    input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)
    with torch.no_grad():
        probabilities = torch.softmax(model(input_ids).logits, -1)
    return probabilities.cpu().numpy()[0, 0]

In [None]:
# Score every candidate, one forward pass per text.
cls_score = [get_cls_score(model, tokenizer, text) for text in tqdm(para_x3)] 

In [None]:
# Style component of the J score.
df['cls'] = cls_score

In [None]:
# Release the classifier before the next scoring model is loaded.
del model
cleanup()

## 6.2 COSINE similarity

In [None]:
# Sentence encoder used for the meaning-preservation (cosine similarity) score.
# Device choice follows the notebook-wide `use_cuda` flag instead of a literal True.
meaning_model, meaning_tokenizer = load_model('cointegrated/LaBSE-en-ru', use_cuda=use_cuda, model_class=AutoModel)

In [None]:
# Source text for every candidate: candidate k was generated from input k // n.
cos_originals = [toxic_comments[k // n] for k in range(len(para_x3))]

# Score all pairs in one batched call instead of one call per pair, so that
# batch_size=32 actually takes effect.
# Assumes evaluate_cosine_similarity returns one score per (original, rewritten)
# pair, in input order — the previous per-pair calls indexed its result with [0]
# under the same contract.
if para_x3:
    cos_score = list(evaluate_cosine_similarity(
        model = meaning_model,
        tokenizer = meaning_tokenizer,
        original_texts = cos_originals,
        rewritten_texts = para_x3,
        batch_size=32,
        verbose=False,
    ))
else:
    cos_score = []
assert len(cos_score) == len(para_x3)

In [None]:
# Meaning-preservation component of the J score.
df['cos'] = cos_score

In [None]:
# Release the sentence encoder before the fluency model is loaded.
del meaning_model
cleanup()

## 6.3 FLUENCY score

In [None]:
cola_model, cola_tolenizer = load_model('SkolkovoInstitute/rubert-base-corruption-detector', use_cuda=True)

In [None]:
fl_score = []

for i, text in enumerate(para_x3):
    x = toxic_comments[int(i//3)]
    y = text

    fluency = evaluate_cola_relative(
        model = cola_model,
        tokenizer = cola_tolenizer,
        original_texts = [x],
        rewritten_texts = [y],
        target_label=1,
        batch_size=32,
        verbose=False
    )
    fl_score.append(fluency[0])

In [None]:
# Fluency component of the J score.
df['fl'] = fl_score

In [None]:
# All three components are computed; release the last scoring model.
del cola_model
cleanup()

## 6.4 MAX J score

In [None]:
# Joint score per candidate: element-wise product of fluency, similarity and style.
# Each column goes through .tolist() first, so the product is computed on arrays
# rebuilt from Python values rather than on the stored column dtype.
df['J'] = list(np.array(df['fl'].tolist()) * np.array(df['cos'].tolist()) * np.array(df['cls'].tolist()))

In [None]:
# For every test input keep the candidate with the highest J score.
# groupby orders the groups by ascending id, which is the order of toxic_comments.
# This replaces a loop that rescanned the whole frame per input, shadowed the
# builtin `id`, and rebound `data`.
best_rows = df.groupby('id')['J'].idxmax()
predicts_clean = df.loc[best_rows, 'text'].tolist()

# One prediction per test input is required for the submission file.
assert len(predicts_clean) == len(toxic_comments), (
    f'{len(predicts_clean)} predictions for {len(toxic_comments)} inputs'
)

# 7. Save results

In [None]:
# One prediction per line. The encoding is set explicitly because the text is
# Cyrillic and the platform default encoding is not guaranteed to be UTF-8.
with open('test.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in predicts_clean)

In [None]:
# Pack the predictions file into the submission archive.
!zip -r test_final.zip test.txt