## Packages

In [None]:
import tqdm
import ast

import pandas as pd
import numpy as np

## BERT
import transformers
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

import torch
from torch.utils.data import DataLoader, Dataset, random_split

from sklearn.model_selection import train_test_split

## Import Data

In [None]:
# Competition inputs: the submission template, the annotated comparison pairs
# used for training, and the comments that have to be scored.
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
pairs = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

## Data Preprocessing

In [None]:
def _collapse_whitespace(text):
    """Replace newlines and double quotes with spaces, then squeeze whitespace runs."""
    return ' '.join(text.replace('\n', ' ').replace('"', " ").split())


# Apply the same normalization to every free-text column that gets tokenized.
for _frame, _column in ((pairs, 'less_toxic'), (pairs, 'more_toxic'), (comments, 'text')):
    _frame[_column] = _frame[_column].apply(_collapse_whitespace)

In [None]:
# Preview the first rows of the cleaned comparison pairs.
pairs.head()

In [None]:
# Assign every distinct comment text an integer id. The texts are sorted first
# because iterating a set of strings follows hash order, which changes between
# interpreter runs; sorting keeps the ids -- and everything derived from them,
# such as the row order fed to the seeded train/valid/test split -- reproducible.
unique_comments = sorted(set(pairs.less_toxic) | set(pairs.more_toxic))
id2comment = dict(enumerate(unique_comments))
comment2id = {comment: idx for idx, comment in id2comment.items()}

In [None]:
# Attach the integer comment id next to each text column of the pair table.
for _id_column, _text_column in (('id_l', 'less_toxic'), ('id_m', 'more_toxic')):
    pairs[_id_column] = pairs[_text_column].map(comment2id)

In [None]:
# Number of (non-null) worker votes for each ordered (less_toxic, more_toxic) id pair.
pairs_count = pairs.groupby(['id_l', 'id_m'])['worker'].count().to_dict()

# Collapse the two orderings of a comment pair into the majority ordering.
# confidence = share of votes that agree with the ordering that is kept.
pairs_count_new = {}
for (less_id, more_id), votes in pairs_count.items():
    reverse_key = (more_id, less_id)
    if reverse_key not in pairs_count:
        # Nobody voted for the opposite ordering: unanimous.
        pairs_count_new[(less_id, more_id)] = 1
        continue

    ratio = votes / (votes + pairs_count[reverse_key])
    if ratio > 0.5:
        pairs_count_new[(less_id, more_id)] = ratio
    # ratio < 0.5: the opposite ordering wins and is recorded when the loop
    #   reaches reverse_key itself.
    # ratio == 0.5: an exact tie (or a comment paired with itself) carries no
    #   ordering information. It used to be emitted in BOTH orders, which fed
    #   the ranking loss two contradictory examples, so it is dropped instead.

In [None]:
# Turn the {(id_l, id_m): confidence} mapping into a flat table and bring the
# comment texts back in through the id -> text lookup.
pairs_new = pd.DataFrame.from_dict({'confidence': pairs_count_new})
pairs_new = pairs_new.reset_index()
pairs_new.columns = ['id_l', 'id_m', 'confidence']
for _text_column, _id_column in (('less_toxic', 'id_l'), ('more_toxic', 'id_m')):
    pairs_new[_text_column] = pairs_new[_id_column].map(id2comment)
pairs_new.sample(5)

In [None]:
# Directory holding the pretrained uncased BERT files; the tokenizer is loaded
# from it with lower-casing enabled.
pretrained_bert = "../input/huggingface-bert/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_bert,
    do_lower_case=True,
)

In [None]:
# Store each comparison as a two-item list [less_toxic, more_toxic].
# The list is built directly from the two columns instead of joining them with
# '[SEP]' and splitting again: that round trip produced more than two items
# whenever a comment itself contained the text '[SEP]', which later breaks the
# pair collation that expects exactly two sequences per row.
pairs_new['pairs'] = pd.Series(
    [[less, more] for less, more in zip(pairs_new.less_toxic, pairs_new.more_toxic)],
    index=pairs_new.index,
)

In [None]:
%%time
def _encode(text):
    """Tokenize one comment (or one [less, more] pair), truncating to 512 tokens."""
    return tokenizer(text, truncation=True, max_length=512)


# Fields of the tokenizer output that are kept as columns.
_encoded_fields = ('input_ids', 'token_type_ids', 'attention_mask')

# Tokenize each row once, then fan the encoding out into one column per field.
# For the pair table every field holds two token lists (less toxic, more toxic).
pair_encodings = pairs_new.pairs.apply(_encode)
for _field in _encoded_fields:
    pairs_new[_field] = pair_encodings.apply(lambda enc, key=_field: enc[key])

comment_encodings = comments.text.apply(_encode)
for _field in _encoded_fields:
    comments[_field] = comment_encodings.apply(lambda enc, key=_field: enc[key])

In [None]:
# 80% of the pairs go to training; the remaining 20% is halved into a
# validation split and a test split. Both splits use the same fixed seed.
X_train, X_holdout = train_test_split(pairs_new, test_size=0.2, random_state=0)
X_valid, X_test = train_test_split(X_holdout, test_size=0.5, random_state=0)

SIZE_OF_TRAIN, SIZE_OF_VALID, SIZE_OF_TEST = (
    len(split) for split in (X_train, X_valid, X_test)
)

print(SIZE_OF_TRAIN, SIZE_OF_VALID, SIZE_OF_TEST)

In [None]:
# Show one random training row to eyeball the tokenized pair columns.
X_train.sample()

In [None]:
# Show one random row of the comments that will be scored.
comments.sample()

## Data Loaders

In [None]:
def pad_to_len(seqs, to_len, padding=0):
    """Return every sequence in ``seqs`` cut or right-padded to ``to_len`` items.

    Args:
        seqs: iterable of lists.
        to_len: target length of each returned list.
        padding: filler value appended to sequences shorter than ``to_len``.

    Returns:
        A new list of lists; the input sequences are not modified.
    """
    def _fit(seq):
        clipped = seq[:to_len]
        # A non-positive multiplier yields an empty list, so sequences that
        # already reach the target length get no filler.
        return clipped + [padding] * (to_len - len(clipped))

    return [_fit(seq) for seq in seqs]

def pad_to_len_pair(seqs, to_len, padding=0):
    """Cut or right-pad every member sequence of every group to ``to_len`` items.

    Args:
        seqs: iterable of groups, each group an iterable of lists (here: the
            two token lists of a comment pair).
        to_len: target length of each innermost list.
        padding: filler value appended to lists shorter than ``to_len``.

    Returns:
        A new nested list with the same grouping as ``seqs``.
    """
    return [
        [
            member[:to_len] + [padding] * max(0, to_len - len(member))
            for member in group
        ]
        for group in seqs
    ]

In [None]:
class dataset(Dataset):
    """Mapping-backed dataset whose ``collate_fn`` pads token fields into tensors."""

    def __init__(self, data):
        # Mapping from an index to one row (a dict of column name -> value).
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

    def collate_fn(self, samples, pair=True):
        """Pad the token fields of ``samples`` to a common length and stack them.

        Args:
            samples: list of rows as returned by ``__getitem__``.
            pair: when True each field holds two token lists per sample and the
                padded length is the longest list over both members; when
                False each field holds a single token list per sample.

        Returns:
            Dict with keys 'input_ids', 'token_type_ids' and 'attention_mask',
            each a tensor built from the zero-padded lists.
        """
        batch = {}

        for key in ('input_ids', 'token_type_ids', 'attention_mask'):
            fields = [sample[key] for sample in samples]

            if pair:
                to_len = max(
                    max(len(field[0]), len(field[1])) for field in fields
                )
                padded = pad_to_len_pair(fields, to_len, 0)
            else:
                to_len = max(len(field) for field in fields)
                padded = pad_to_len(fields, to_len, 0)

            batch[key] = torch.tensor(padded)

        return batch

In [None]:
def _rows_by_index(frame):
    """Convert a DataFrame into a {index_label: {column: value}} dict."""
    return frame.transpose().to_dict()


# The split frames are re-indexed from 0 so that the integer positions the
# DataLoader asks for exist as keys; `comments` is used with its index as is.
train_dataset = dataset(_rows_by_index(X_train.reset_index(drop=True)))
valid_dataset = dataset(_rows_by_index(X_valid.reset_index(drop=True)))
test_dataset = dataset(_rows_by_index(X_test.reset_index(drop=True)))
submission_dataset = dataset(_rows_by_index(comments))

In [None]:
# Samples per batch for every DataLoader below; also handed to the Estimator
# as the 'batch_size' hyperparameter.
BATCH_SIZE = 8

In [None]:
# The train / valid / test splits hold comment PAIRS, so they are collated with
# the default pair=True mode. Each loader uses the bound collate_fn of its own
# dataset instance (previously all three were routed through train_dataset).
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=train_dataset.collate_fn,
)

valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    collate_fn=valid_dataset.collate_fn,
)

# NOTE: this cell used to define test_loader a second time with pair=False.
# That definition silently replaced this one and cannot collate the
# pair-structured test split, so it has been removed.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    collate_fn=test_dataset.collate_fn,
)

# The comments to score are single texts, hence pair=False. shuffle stays off
# so the produced scores line up with the row order of `comments`.
submission_loader = DataLoader(
    dataset=submission_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    collate_fn=lambda samples: submission_dataset.collate_fn(samples, pair=False),
)

## Model

In [None]:
class BERT(torch.nn.Module):
    """BERT encoder followed by a small head that emits one score per input.

    The head takes the last-layer hidden state at position 0, applies dropout,
    a hidden-size -> hidden-size linear layer with ReLU, dropout again, and a
    final linear layer down to a single value.

    Args:
        pretrained_path: directory or model name passed to
            ``BertModel.from_pretrained``. Defaults to the checkpoint this
            notebook was written against.
    """

    def __init__(self, pretrained_path="../input/huggingface-bert/bert-base-uncased"):
        super().__init__()
        self.model = BertModel.from_pretrained(pretrained_path)
        # Size the head from the loaded encoder instead of hard-coding 768 so
        # other BERT variants can be plugged in through `pretrained_path`.
        hidden_size = self.model.config.hidden_size
        self.extractor = torch.nn.Linear(hidden_size, hidden_size)
        # The attribute name (including its spelling) is part of the
        # state_dict keys, so it is kept to stay compatible with saved weights.
        self.classifer = torch.nn.Linear(hidden_size, 1)
        self.input_drop = torch.nn.Dropout(0.7)
        self.dropout = torch.nn.Dropout(0.1)
        self.relu = torch.nn.ReLU()

    def forward(self, ids, mask):
        """Score a batch of token sequences.

        Args:
            ids: token id tensor, forwarded to the encoder as ``input_ids``.
            mask: attention mask tensor matching ``ids``.

        Returns:
            Tensor with one score per row of ``ids`` (last dimension of size 1).
        """
        encoded = self.model(input_ids=ids, attention_mask=mask)
        # Hidden state of the first token of every sequence.
        cls_state = encoded.last_hidden_state[:, 0, :]
        cls_state = self.input_drop(cls_state)
        features = self.dropout(self.relu(self.extractor(cls_state)))
        return self.classifer(features)

In [None]:
class Estimator():
    """Trains and evaluates a scoring model with a pairwise hinge (margin 1) loss.

    Args:
        hyperparameters: dict with 'learning_rate', 'weight_decay' and 'epoch'.
            A 'batch_size' entry is accepted but no longer needed: the number
            of scheduler steps is taken from the training loader itself.
        device: torch device the model and the batches are moved to.
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one score per input row.
        optim: 'Adam' or 'AdamW'.

    Raises:
        ValueError: if ``optim`` names an unsupported optimizer (this used to
            leave ``self.optimizer`` undefined and fail later inside ``fit``).
    """

    def __init__(self, hyperparameters, device, model, optim='AdamW'):
        self.params = hyperparameters
        self.model = model
        self.device = device

        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        decay_params, no_decay_params = [], []
        for name, param in self.model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': self.params['weight_decay']},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]

        optimizers = {'Adam': torch.optim.Adam, 'AdamW': torch.optim.AdamW}
        if optim not in optimizers:
            raise ValueError(
                f"Unsupported optimizer {optim!r}; expected one of {sorted(optimizers)}"
            )
        self.optimizer = optimizers[optim](params=optimizer_grouped_parameters,
                                           lr=self.params['learning_rate'])
        self.model.to(self.device)

    def load_weight(self, weight_path):
        """Load a state dict from ``weight_path`` into the model.

        ``map_location`` makes a checkpoint saved on another device (e.g. a
        GPU) loadable on the device this estimator runs on.
        """
        self.model.load_state_dict(torch.load(weight_path, map_location=self.device))

    def _pairwise_step(self, batch):
        """Score one batch of pairs; return (loss, number correct, number of pairs).

        The batch tensors are unpacked as (pairs, 2, length). Both members of
        every pair are scored in one forward pass; column 0 of the reshaped
        output is treated as the lower-ranked member and column 1 as the
        higher-ranked one.
        """
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)

        batch_size, _, max_length = input_ids.size()
        input_ids = input_ids.reshape(batch_size * 2, max_length)
        attention_mask = attention_mask.reshape(batch_size * 2, max_length)

        output = self.model(input_ids, attention_mask).reshape(batch_size, 2)
        neg_score = output[:, 0]
        pos_score = output[:, 1]
        correct = (pos_score > neg_score).sum().float().item()
        # Hinge loss: zero once pos_score exceeds neg_score by at least 1.
        loss = torch.mean(torch.max(torch.zeros_like(pos_score),
                                    torch.ones_like(pos_score) - pos_score + neg_score))
        return loss, correct, batch_size

    def fit(self, data, save_name):
        """Train for ``params['epoch']`` epochs and keep the best checkpoint.

        Args:
            data: dict with DataLoaders under 'train' and 'test'; the 'test'
                loader is evaluated after every epoch.
            save_name: checkpoint path without extension; the state dict with
                the lowest mean evaluation loss is written to
                ``f"{save_name}.pth"``.
        """
        train_loader, eval_loader = data['train'], data['test']
        len_of_train = len(train_loader.dataset)
        len_of_test = len(eval_loader.dataset)
        best = float('inf')

        # One scheduler step per optimizer step. len(loader) is the real number
        # of batches per epoch, including a final partial batch, which the old
        # int(N / batch_size * epochs) estimate truncated away.
        train_steps = len(train_loader) * self.params['epoch']
        num_steps = int(train_steps * 0.1)  # warm up over the first 10% of steps

        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_steps, train_steps)

        for epoch in range(self.params['epoch']):
            total_loss = 0
            total_acc = 0
            total_val_acc = 0
            total_val_loss = 0

            self.model.train()
            for batch in tqdm.tqdm(train_loader):
                loss, correct, batch_size = self._pairwise_step(batch)
                total_acc += correct
                # Weight by the number of PAIRS. The previous code multiplied by
                # the reshaped tensor length (2 * pairs), doubling reported loss.
                total_loss += loss.item() * batch_size

                ## OPTIMIZE
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                scheduler.step()  # Update learning rate schedule

            self.model.eval()
            with torch.no_grad():
                for batch in eval_loader:
                    loss, correct, batch_size = self._pairwise_step(batch)
                    total_val_acc += correct
                    total_val_loss += loss.item() * batch_size

            print(f'Epoch: {epoch}, Train Loss: {(total_loss/len_of_train)}, Train Acc: {(total_acc/len_of_train)}, \
            Test Loss: {(total_val_loss/len_of_test)}, Test Acc: {(total_val_acc/len_of_test)}')

            # Compare and store the same quantity (mean loss per pair). Before,
            # `best` was overwritten with the un-normalized total, so later
            # epochs were almost always judged an improvement.
            val_loss = total_val_loss / len_of_test
            if val_loss < best:
                best = val_loss
                torch.save(self.model.state_dict(), f"{save_name}.pth")

    def inference(self, data):
        """Score single (unpaired) inputs.

        Args:
            data: iterable of batches, each a dict with 'input_ids' and
                'attention_mask' tensors accepted by the model.

        Returns:
            Flat list of float scores, one per input row, in iteration order.
        """
        self.model.eval()
        scores = []
        with torch.no_grad():
            for batch in data:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                output = self.model(input_ids, attention_mask)
                scores.extend(output.reshape(-1).tolist())
        return scores

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
# Fine-tuning configuration handed to the Estimator.
hyperparameters = {
    'learning_rate': 1e-5,
    'epoch': 3,
    'batch_size': BATCH_SIZE,
    'weight_decay': 1e-4,
}
estimator = Estimator(hyperparameters=hyperparameters, device=device, model=BERT())

In [None]:
# fit() evaluates on whatever loader is stored under 'test'; the validation
# split is passed there, and the best weights are saved under 'best_model'.
loaders = {'train': train_loader, 'test': valid_loader}
estimator.fit(loaders, save_name='best_model')

In [None]:
# Restore the best checkpoint written during training. map_location lets a
# checkpoint saved from a GPU run load on whatever device the estimator uses
# now (e.g. a CPU-only session), where a plain torch.load would fail.
estimator.model.load_state_dict(
    torch.load('./best_model.pth', map_location=estimator.device)
)
estimator.model.eval()

# Score every comment in loader order (the submission loader does not shuffle).
# Each batch contributes one single-element [score] row per comment.
outputs = []
with torch.no_grad():
    for batch in tqdm.tqdm(submission_loader):
        ## INPUT
        input_ids = batch['input_ids'].to(estimator.device)
        attention_mask = batch['attention_mask'].to(estimator.device)

        ## FORWARD
        output = estimator.model(input_ids, attention_mask)
        outputs.extend(output.tolist())

In [None]:
# Every entry of `outputs` is a single-element list; unwrap it into a plain
# score per comment, then copy the scores into the submission table.
comments['score'] = [row[0] for row in outputs]
submission['score'] = comments['score']

In [None]:
# The 20 comments with the highest predicted score.
comments[['text', 'score']].sort_values('score', ascending=False).head(20)

In [None]:
# The 20 comments with the lowest predicted score (tail of the descending sort).
comments[['text', 'score']].sort_values('score', ascending=False).tail(20)

In [None]:
# Write the submission table to disk without the DataFrame index column.
submission.to_csv('./submission.csv', index=False)