In [None]:
!pip install 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

# MAIN

In [None]:
import os
import gc
import copy
import time
import random
import string
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from glob import glob
from tqdm.notebook import tqdm

from collections import defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from cosine_annealing_warmup import CosineAnnealingWarmupRestarts

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), puts cuDNN into deterministic mode and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    # `random` is imported by this notebook, so it has to be seeded as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU rather than only the current device.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Central settings dictionary read by the dataset, model, training and
# scheduler code in this notebook.
CONFIG = {
    "seed": 42,  # passed to set_seed() right after this dict is built
    "epochs": 3,  # epochs trained per fold
    "model_name": "unitary/multilingual-toxic-xlm-roberta",  # checkpoint for tokenizer and backbone
    "train_batch_size": 8,  # batch size of the training DataLoader
    "valid_batch_size": 32,  # batch size of the validation DataLoader
    "max_length": 128,  # sequences are truncated / padded to this many tokens
    "learning_rate": 1e-5,  # optimizer lr and the scheduler's max_lr
    # Scheduler name dispatched on by fetch_scheduler().
    "scheduler": 'CosineAnnealingWarmupRestarts', #'MultiStepLR',
    "min_lr": 1e-6,  # lower lr bound handed to the cosine schedulers
    "T_mult": 1,  # cycle_mult of CosineAnnealingWarmupRestarts
    "warmup_steps": 10,  # warmup_steps of CosineAnnealingWarmupRestarts
    "gamma": 1,  # gamma of CosineAnnealingWarmupRestarts
    "weight_decay": 2e-5,  # weight decay given to the optimizer
    "n_fold": 5,  # number of folds the training loop iterates over
    "n_accumulate": 1,  # batches accumulated before each optimizer step
    "num_classes": 1,  # output width of the model's linear head
    "margin": 0.5,  # margin of the MarginRankingLoss used in criterion()
    # First CUDA device when available, otherwise CPU.
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

# The tokenizer is stored in CONFIG so the dataset factory can reach it.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = CONFIG['model_name'] + '_Baseline'
set_seed(CONFIG['seed'])

In [None]:
class JigsawDataset(Dataset):
    """Pairwise dataset yielding a tokenized (more toxic, less toxic) pair.

    Args:
        df: DataFrame providing 'more_toxic' and 'less_toxic' text columns.
        tokenizer: Object exposing ``encode_plus`` whose result contains
            'input_ids' and 'attention_mask'.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        # Plain arrays of the two text columns for index-based access.
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values

    def __len__(self):
        """Return the number of comment pairs."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize one comment, truncated and padded to ``self.max_len``."""
        return self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )

    def __getitem__(self, index):
        """Return the ``index``-th pair as a dict of ``torch.long`` tensors."""
        encoded_more = self._encode(self.more_toxic[index])
        encoded_less = self._encode(self.less_toxic[index])

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'more_toxic_ids': as_long(encoded_more['input_ids']),
            'more_toxic_mask': as_long(encoded_more['attention_mask']),
            'less_toxic_ids': as_long(encoded_less['input_ids']),
            'less_toxic_mask': as_long(encoded_less['attention_mask']),
            # Constant label: the first element of the pair is the one that
            # should be scored higher.
            'target': as_long(1),
        }
    


class JigsawModel(nn.Module):
    """Pretrained transformer encoder with a linear scoring head.

    The head is applied to the hidden state of the first token of each
    sequence and produces ``CONFIG['num_classes']`` values per sequence.

    Args:
        model_name: Checkpoint name or path understood by
            ``AutoConfig`` / ``AutoModel``.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        # Dropout is switched off in the backbone. Hidden states are also
        # requested, although forward() only reads `last_hidden_state`.
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Size the head from the backbone's config instead of assuming 768,
        # so checkpoints with a different hidden width also work.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Score a batch of token sequences.

        Args:
            ids: Token ids, forwarded as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            Tensor of shape (batch, CONFIG['num_classes']).
        """
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        # Position 0 along the sequence axis is used as the sequence summary.
        first_token = out.last_hidden_state[:, 0, :]
        return self.linear(first_token)

In [None]:
def criterion(outputs1, outputs2, targets, margin=None):
    """Margin ranking loss between two batches of scores.

    The model emits scores shaped (batch, 1) while the targets are shaped
    (batch,). Passing them as-is either broadcasts to a (batch, batch) loss
    matrix or is rejected by PyTorch versions that require all inputs to
    have the same number of dimensions. Everything is therefore flattened
    to 1-D first, and the targets are cast to the scores' dtype.

    Args:
        outputs1: Scores that should rank higher where ``targets`` is 1.
        outputs2: Scores that should rank lower where ``targets`` is 1.
        targets: Tensor of 1 / -1 labels, one per pair.
        margin: Optional margin override; defaults to ``CONFIG['margin']``.

    Returns:
        Scalar tensor holding the mean loss over the batch.
    """
    if margin is None:
        margin = CONFIG['margin']
    scores1 = outputs1.reshape(-1)
    scores2 = outputs2.reshape(-1)
    labels = targets.reshape(-1).to(scores1.dtype)
    return nn.MarginRankingLoss(margin=margin)(scores1, scores2, labels)


def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Gradients are accumulated over ``CONFIG['n_accumulate']`` batches before
    each optimizer step; the scheduler (if any) is stepped once per
    optimizer step.

    Args:
        model: Callable as ``model(ids, mask)`` returning scores.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, or ``None``.
        dataloader: Yields dicts with the 'more_toxic_*', 'less_toxic_*'
            and 'target' tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean of the unscaled batch losses, or NaN when the
        dataloader yields no batch.
    """
    model.train()

    n_accumulate = CONFIG['n_accumulate']
    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not hit an unbound name.
    epoch_loss = float('nan')

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        # Only the back-propagated value is scaled for accumulation; the raw
        # loss is kept for reporting so it stays comparable with the
        # validation loss regardless of n_accumulate.
        (loss / n_accumulate).backward()

        if (step + 1) % n_accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(
            Epoch=epoch,
            Train_Loss=epoch_loss,
            LR=optimizer.param_groups[0]['lr']
        )

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Compute the validation loss of ``model`` over ``dataloader``.

    Runs with gradients disabled and the model in eval mode.

    Args:
        model: Callable as ``model(ids, mask)`` returning scores.
        dataloader: Yields dicts with the 'more_toxic_*', 'less_toxic_*'
            and 'target' tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.
        optimizer: Optional optimizer whose current learning rate is shown
            in the progress bar. This used to be read from an undeclared
            global; it is now an explicit, optional argument.

    Returns:
        Sample-weighted mean loss, or NaN when the dataloader yields no
        batch.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not hit an unbound name.
    epoch_loss = float('nan')

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = {'Epoch': epoch, 'Valid_Loss': epoch_loss}
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()
    return epoch_loss


@torch.no_grad()
def valid_trainingdata(model, dataloader, device):
    """Predict scores for both comments of every pair in ``dataloader``.

    Runs with gradients disabled and the model in eval mode.

    Args:
        model: Callable as ``model(ids, mask)`` returning scores.
        dataloader: Yields dicts with the 'more_toxic_*' and 'less_toxic_*'
            tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(more_toxic_preds, less_toxic_preds)`` of 1-D NumPy arrays
        in dataloader order. Both are empty when the dataloader yields no
        batch.
    """
    model.eval()

    more_toxic_batches = []
    less_toxic_batches = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype=torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype=torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype=torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype=torch.long)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)

        # Flatten each batch of scores to 1-D and move it to host memory.
        more_toxic_batches.append(more_toxic_outputs.view(-1).cpu().detach().numpy())
        less_toxic_batches.append(less_toxic_outputs.view(-1).cpu().detach().numpy())

    def _join(batches):
        # np.concatenate raises on an empty list, so handle that case here.
        if not batches:
            return np.empty(0, dtype=np.float32)
        return np.concatenate(batches)

    more_toxic_preds = _join(more_toxic_batches)
    less_toxic_preds = _join(less_toxic_batches)
    gc.collect()

    return more_toxic_preds, less_toxic_preds


def run_training(model, optimizer, scheduler, device, num_epochs, fold,
                 train_loader=None, valid_loader=None):
    """Train ``model`` for ``num_epochs`` epochs and keep the best weights.

    After each epoch the validation loss is computed. Whenever it is not
    worse than the best seen so far, the weights are snapshotted in memory
    and written to ``Loss-Fold-{fold}.bin`` in the working directory.

    Args:
        model: Model to train.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, or ``None``.
        device: Device the batches are moved to.
        num_epochs: Number of epochs to run.
        fold: Fold index, used in the checkpoint file name.
        train_loader: Training DataLoader. When omitted, the notebook-level
            ``train_loader`` global is used, as existing callers expect.
        valid_loader: Validation DataLoader. When omitted, the
            notebook-level ``valid_loader`` global is used.

    Returns:
        Tuple ``(model, history)``: the model with its best weights loaded,
        and a dict of per-epoch 'Train Loss' / 'Valid Loss' lists.
    """
    # Backward compatibility: earlier callers relied on module-level loaders.
    if train_loader is None:
        train_loader = globals()['train_loader']
    if valid_loader is None:
        valid_loader = globals()['valid_loader']

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Use the `device` argument rather than reaching back into CONFIG.
        train_epoch_loss = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader=train_loader,
            device=device,
            epoch=epoch
        )

        val_epoch_loss = valid_one_epoch(
            model,
            valid_loader,
            device=device,
            epoch=epoch
        )

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Snapshot and persist the weights whenever validation does not regress.
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss}->{val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            print("Model Saved")
        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60
    ))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # Restore the best weights before handing the model back.
    model.load_state_dict(best_model_wts)

    return model, history


def prepare_loaders(df, fold):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``fold`` column equals ``fold`` form the validation split;
    all other rows form the training split.

    Args:
        df: DataFrame with a ``fold`` column plus the text columns consumed
            by ``JigsawDataset``.
        fold: Fold value held out for validation.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    frames = {
        'train': df[df.fold != fold].reset_index(drop=True),
        'valid': df[df.fold == fold].reset_index(drop=True),
    }
    datasets = {
        split: JigsawDataset(
            frame,
            tokenizer=CONFIG['tokenizer'],
            max_length=CONFIG['max_length']
        )
        for split, frame in frames.items()
    }

    # Options common to both loaders.
    common = dict(num_workers=2, pin_memory=True)

    # Training batches are shuffled and the trailing partial batch is dropped.
    train_loader = DataLoader(
        datasets['train'],
        batch_size=CONFIG['train_batch_size'],
        shuffle=True,
        drop_last=True,
        **common
    )
    # Validation keeps row order and every sample.
    valid_loader = DataLoader(
        datasets['valid'],
        batch_size=CONFIG['valid_batch_size'],
        shuffle=False,
        **common
    )

    return train_loader, valid_loader


def fetch_scheduler(optimizer, size=None):
    """Create the learning-rate scheduler named by ``CONFIG['scheduler']``.

    Args:
        optimizer: Optimizer the scheduler will drive.
        size: Step count used as ``first_cycle_steps`` for
            'CosineAnnealingWarmupRestarts' and to place the 'MultiStepLR'
            milestones. Unused by 'CosineAnnealingLR'.

    Returns:
        The scheduler instance, or ``None`` when ``CONFIG['scheduler']`` is
        ``None``.

    Raises:
        KeyError: If 'CosineAnnealingLR' is selected but ``CONFIG['T_max']``
            is not defined.
        ValueError: If ``CONFIG['scheduler']`` names an unsupported
            scheduler. Previously this surfaced as an UnboundLocalError.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        if 'T_max' not in CONFIG:
            raise KeyError(
                "CONFIG['T_max'] must be set to use the 'CosineAnnealingLR' scheduler"
            )
        return lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=CONFIG['T_max'],
            eta_min=CONFIG['min_lr']
        )

    if name == 'CosineAnnealingWarmupRestarts':
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=size,
            cycle_mult=CONFIG['T_mult'],
            max_lr=CONFIG['learning_rate'],
            min_lr=CONFIG['min_lr'],
            warmup_steps=CONFIG['warmup_steps'],
            gamma=CONFIG['gamma']
        )

    if name == 'MultiStepLR':
        # One milestone per multiple of `size` (plus one step), for
        # multiples 1 .. epochs - 1.
        return lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[size * i + 1 for i in range(1, CONFIG['epochs'])],
            gamma=0.1
        )

    raise ValueError(f"Unsupported scheduler: {name!r}")

In [None]:
class UnionFind():
    """Disjoint-set structure over the integers ``0 .. n - 1``.

    ``parents[i]`` is the parent index of ``i``; for a root it instead holds
    the negated number of elements in that root's set.
    """

    def __init__(self, n):
        self.n = n
        # Every element starts as the root of its own one-element set.
        self.parents = [-1 for _ in range(n)]

    def find(self, x):
        """Return the root of ``x``'s set, compressing the path on the way."""
        # First pass: climb to the root.
        root = x
        while self.parents[root] >= 0:
            root = self.parents[root]
        # Second pass: point every node on the path directly at the root.
        node = x
        while node != root:
            parent = self.parents[node]
            self.parents[node] = root
            node = parent
        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (no-op if already joined)."""
        keep, absorb = self.find(x), self.find(y)
        if keep == absorb:
            return
        # Sizes are stored negated, so the larger value is the smaller set;
        # make sure `keep` is the root of the larger set.
        if self.parents[keep] > self.parents[absorb]:
            keep, absorb = absorb, keep
        self.parents[keep] += self.parents[absorb]
        self.parents[absorb] = keep

        
def get_group_unionfind(train: pd.DataFrame):
    """Label each row with the connected component its comments belong to.

    Every distinct text in 'less_toxic' / 'more_toxic' is treated as a node
    and every row as an edge between its two texts. Rows that are linked
    through shared texts end up with the same ``group`` value.

    Args:
        train: DataFrame with 'less_toxic' and 'more_toxic' text columns.
            It is not modified.

    Returns:
        A new DataFrame equal to ``train`` plus a ``group`` column holding
        the component identifier of each row's 'less_toxic' text.
    """
    unique_text = pd.concat([train['less_toxic'], train['more_toxic']]).unique()
    text2num = {text: i for i, text in enumerate(unique_text)}

    uf = UnionFind(len(unique_text))
    less_ids = train['less_toxic'].map(text2num)
    more_ids = train['more_toxic'].map(text2num)
    for less_id, more_id in zip(less_ids, more_ids):
        uf.union(less_id, more_id)

    text2group = {text: uf.find(num) for text, num in text2num.items()}
    # assign() builds a new frame, so the caller's DataFrame is left
    # untouched and no temporary id columns need to be dropped.
    return train.assign(group=train['less_toxic'].map(text2group))


train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Assign folds so that all rows of one text group land in the same fold.
train = get_group_unionfind(train)
train['fold'] = -1
# Take the fold count from CONFIG so it always matches the training loop,
# which iterates over range(CONFIG['n_fold']).
group_kfold = GroupKFold(n_splits=CONFIG['n_fold'])
for fold, (trn_idx, val_idx) in enumerate(group_kfold.split(train, groups=train['group'])):
    # val_idx holds positions; they are used as labels here, which relies on
    # `train` having a default 0..n-1 index.
    train.loc[val_idx, "fold"] = fold

In [None]:
# Train one model per fold and collect out-of-fold predictions.
for fold in range(0, CONFIG['n_fold']):
    print(f"====== Fold: {fold} ======")

    # Create Dataloaders
    train_loader, valid_loader = prepare_loaders(train, fold=fold)
    size = len(train_loader)
    print('train num_batch:', size)

    model = JigsawModel(CONFIG['model_name'])
    model.to(CONFIG['device'])

    # Define Optimizer and Scheduler
    optimizer = AdamW(
        model.parameters(),
        lr=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay']
    )
    # `size` (batches per epoch) sets the scheduler's step counts.
    scheduler = fetch_scheduler(optimizer, size=size)

    model, history = run_training(
        model,
        optimizer,
        scheduler,
        device=CONFIG['device'],
        num_epochs=CONFIG['epochs'],
        fold=fold
    )

    # Score the held-out fold with the best weights of this fold's model.
    more_pred, less_pred = valid_trainingdata(
        model,
        valid_loader,
        device=CONFIG['device']
    )
    train.loc[train['fold']==fold, 'more_pred'] = more_pred
    train.loc[train['fold']==fold, 'less_pred'] = less_pred

    # The optimizer and scheduler hold references to the model's parameters
    # (and the optimizer its per-parameter state), so they must be released
    # too. Otherwise the previous fold's tensors stay alive while the next
    # model is created.
    del model, history, optimizer, scheduler, train_loader, valid_loader
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Fraction of pairs where the "more toxic" comment received the higher score.
score = round((train['more_pred'] > train['less_pred']).mean(), 5)
print(f'CV: {score}')

train.to_csv('train_pred.csv', index=False)