In [None]:
!pip install nlpaug

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

from transformers import (AutoModel, 
                          AutoModelForMaskedLM,
                          AutoTokenizer,
                          AutoConfig,
                          AdamW,
                          LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)

from tqdm import tqdm
from collections import defaultdict

import warnings, os, gc, random, re 
warnings.filterwarnings("ignore")

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU and all CUDA devices).
    """
    # `random` is imported by the notebook, so it has to be seeded as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(42)

In [None]:
# Competition files: `df` provides the `less_toxic` / `more_toxic` comment
# columns and `test_df` provides the `text` column of comments to score.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
test_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Preview the first rows of the validation data.
df.head()

# Preprocessing and Cleaning
Here we concatenate all the text data and remove any duplicate rows.  
We also clean some noise and punctuation from the input data.  
`\n` characters are replaced with spaces when cleaning, which is important because `LineByLineTextDataset` treats every line of the file as a separate example.

In [None]:
# Pool every comment from both validation columns and the test set, keep each
# distinct text once, and renumber the result from 0.
pretrain_text = (
    pd.concat([df.less_toxic, df.more_toxic, test_df.text])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
def clean(data):
    """Normalise punctuation and repeated characters in a Series of comments.

    Args:
        data: pandas Series of strings.

    Returns:
        A new Series in which newlines are replaced by spaces, punctuation
        glued between two words is spaced out, runs of repeated characters
        are shortened, and surrounding whitespace is stripped.
    """
    # A newline is plain text, not a pattern.
    data = data.str.replace('\n', ' ', regex=False)

    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        # Clean some punctuation: "word/word" -> "word / word"
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Replace repeating characters
        (r'(")\1+', r'\1'),
        (r'([*!?\'])\1\1+\B', r'\1\1'),
        (r'(\w)\1\1+\B', r'\1\1'),
        (r'(\w)\1+\b', r'\1'),
    )
    # regex=True is spelled out because pandas 2.0 made regex=False the
    # default, which would treat these patterns as literal text.
    for pattern, replacement in substitutions:
        data = data.str.replace(pattern, replacement, regex=True)

    return data.str.strip()

In [None]:
pretrain_text = clean(pretrain_text)

# One cleaned comment per line. The encoding is fixed to UTF-8 because
# LineByLineTextDataset reads the file back as UTF-8, whatever the platform's
# default text encoding is.
text = '\n'.join(pretrain_text.tolist())
with open('text.txt', 'w', encoding='utf-8') as f:
    f.write(text)

# MLM Pretraining
We'll perform masked language model (MLM) pretraining on our transformer model.  
This usually improves downstream performance when fine-tuning/ensembling.  
We'll also explore using the pretrained model for contextual data augmentation.  

This code was based on maunish's excellent CommonLit MLM notebook:    
https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain

In [None]:
class cfg:
    """Settings for the MLM pretraining run."""
    # Hugging Face Hub checkpoint to start from.
    model_name = 'GroNLP/hateBERT'
    # Directory that receives the tokenizer and the final model.
    output_dir = './hatebert_mlm'

    epochs = 3 # adjust
    learning_rate = 5e-05
    train_batch_size = 32
    eval_batch_size = 32
    gradient_accum_steps = 1
    # Evaluate every `eval_steps` optimisation steps.
    eval_steps = 200
    # Lines of the corpus are truncated to this many tokens.
    block_size = 256
    # Masking probability handed to the MLM data collator.
    mlm_prob = 0.15
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing when the training arguments are built.
    fp16 = torch.cuda.is_available()

In [None]:
# Load the checkpoint with a masked-language-modelling head, plus its tokenizer.
model = AutoModelForMaskedLM.from_pretrained(cfg.model_name)
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
# Store the tokenizer in cfg.output_dir so that directory can later be loaded
# on its own; the trailing `;` hides the returned value in the cell output.
tokenizer.save_pretrained(cfg.output_dir);

In [None]:
# Each line of text.txt is one example; sequences are truncated to block size.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="text.txt",
    block_size=cfg.block_size)

# Evaluation uses the same corpus as training, so the tokenized dataset is
# reused instead of reading and tokenizing text.txt a second time.
# NOTE(review): eval_loss is therefore measured on text the model has trained
# on; it is not a held-out estimate.
valid_dataset = train_dataset

# Randomly masks tokens in every batch with probability cfg.mlm_prob.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=cfg.mlm_prob)

In [None]:
# Checkpoints go to a separate "<output_dir>_chk" directory; the final model is
# written to cfg.output_dir after training.
training_args = TrainingArguments(
    output_dir=cfg.output_dir+'_chk',
    overwrite_output_dir=True,
    num_train_epochs=cfg.epochs,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    learning_rate=cfg.learning_rate,
    gradient_accumulation_steps=cfg.gradient_accum_steps,
    fp16=cfg.fp16,
    eval_steps=cfg.eval_steps,
    evaluation_strategy='steps',
    # load_best_model_at_end can only restore an evaluation that has a
    # checkpoint, and the save interval must be a multiple of the evaluation
    # interval. The default save_steps (500) does not line up with
    # eval_steps, so checkpoint on the evaluation schedule.
    save_strategy='steps',
    save_steps=cfg.eval_steps,
    save_total_limit=2,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to='none')

In [None]:
# Bundle the model, the training arguments, both datasets and the masking
# collator into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

In [None]:
# Run MLM training. Because the training arguments set
# load_best_model_at_end=True, the model held by the trainer afterwards is the
# checkpoint with the lowest eval_loss.
trainer.train()
# Write that model to cfg.output_dir, the directory the tokenizer was saved to.
trainer.save_model(cfg.output_dir)

# Data Augmentation
`nlpaug` helps with generating synthetic data for augmenting NLP pipelines.  
The library also makes it simple to use context-aware augmentations such as MLM for sentence augmentation.  
We'll go through some augmentation methods before testing out our pretrained `HateBERT` model.  

A list of supported augmentation strategies can be found in the documentation:  
https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb

## Synonym Replacement

In [None]:
# Pick one cleaned comment and let the WordNet synonym augmenter rewrite it.
example_text = pretrain_text[7500]
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(example_text)

# One print call, one item per line; the bare '\n' item leaves the blank gap
# between the original and the augmented text.
print("Original:", example_text, '\n', "Augmented Text:", augmented_text, sep='\n')

## Back Translation

In [None]:
# Back translation: English -> German with the first WMT19 model, then
# German -> English with the second one.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en')

In [None]:
# Send one cleaned comment through the back-translation augmenter.
example_text = pretrain_text[200]
augmented_text = back_translation_aug.augment(example_text)

# One print call, one item per line; the bare '\n' item leaves the blank gap
# between the original and the augmented text.
print("Original:", example_text, '\n', "Augmented Text:", augmented_text, sep='\n')

## Contextual (Word Embeddings) Augmentation

In [None]:
# action='substitute' performs MLM augmentation using the model and tokenizer
# stored in cfg.output_dir.
# Run on the GPU when one is present, otherwise fall back to the CPU instead
# of failing on a hard-coded 'cuda' device.
aug = naw.ContextualWordEmbsAug(model_path=cfg.output_dir,
                                action='substitute',
                                aug_p=0.15,
                                device='cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Augment one cleaned comment with the contextual (MLM) augmenter.
example_text = pretrain_text[7]
augmented_text = aug.augment(example_text)

# One print call, one item per line; the bare '\n' item leaves the blank gap
# between the original and the augmented text.
print("Original:", example_text, '\n', "Augmented Text:", augmented_text, sep='\n')