In [None]:
import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from torch.nn.modules.loss import _WeightedLoss
import scipy as sp
import numpy as np
import pandas as pd
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold

from tqdm.auto import tqdm
from functools import partial

import cv2
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
import random
import warnings 
warnings.filterwarnings('ignore')
from transformers import RobertaModel, RobertaTokenizer

# Target device for the model and input tensors: CUDA when available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup


In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Sets the `PYTHONHASHSEED` environment variable, seeds Python's `random`,
    NumPy and PyTorch (including its CUDA generator), and configures cuDNN
    with `deterministic=True` and `benchmark=False`.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Environment variables must be strings.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each seeding function takes the same integer seed.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(seed=2019)

In [None]:
# Input directory that the test set and the sample submission are read from.
PATH = '../input/jigsaw-toxic-comment-classification-challenge'
# Test-set frame; its `comment_text` column is what SentimentData tokenizes.
test = pd.read_csv(os.path.join(PATH, 'test.csv.zip'))

In [None]:
# Submission template; its six label columns are overwritten with the model
# predictions further down before the frame is written to disk.
sub = pd.read_csv(os.path.join(PATH, 'sample_submission.csv.zip'))

In [None]:
# Show the template's (rows, columns) as the cell output.
sub.shape

In [None]:
class CFG:
    """Namespace of run settings, accessed as class attributes (never instantiated).

    Only `batch_size` and `num_workers` are read by the inference cells
    visible in this notebook; the remaining fields are kept as-is.
    """

    # --- general ---------------------------------------------------------
    debug = False
    apex = False
    print_freq = 100
    num_workers = 4  # DataLoader worker processes
    model_name = 'roberta-base'  # commented-out alternative: 'swin_large_patch4_window7_224'
    size = 224

    # --- scheduler -------------------------------------------------------
    # One of: 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts'
    scheduler = 'CosineAnnealingWarmRestarts'
    epochs = 5
    # ReduceLROnPlateau settings (disabled): factor=0.2, patience=4, eps=1e-6
    T_max = 10  # CosineAnnealingLR

    # --- optimizer -------------------------------------------------------
    opt_wd_non_norm_bias = 0.01
    opt_wd_norm_bias = 0  # same as Adam in Fastai
    opt_beta1 = 0.9
    opt_beta2 = 0.99  # same as Adam in Fastai
    opt_eps = 1e-5  # same as Adam in Fastai
    T_0 = 10  # CosineAnnealingWarmRestarts
    lr = 1e-4
    min_lr = 1e-6
    batch_size = 32  # the inference loader uses twice this value
    weight_decay = 1e-6
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- experiment ------------------------------------------------------
    seed = 2019
    target_size = 1
    n_fold = 5

In [None]:
# Token length passed to SentimentData as `max_len`; every comment is padded
# or truncated to this many tokens.
MAX_LEN = 128
# Tokenizer loaded from the local '../input/roberta-base' directory.
tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

In [None]:
def inference_fn(model, dataloader):
    """Run `model` over every batch of `dataloader` and collect its outputs.

    Args:
        model: Module called as `model(ids, mask, token_type_ids)`; it is
            switched to eval mode before the loop.
        dataloader: Iterable of dict batches holding 'ids', 'mask' and
            'token_type_ids' tensors.

    Returns:
        NumPy array with the per-batch model outputs concatenated along
        axis 0, in dataloader order.
    """
    input_keys = ('ids', 'mask', 'token_type_ids')
    batch_outputs = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            # Move the three inputs to the target device as int64 tensors.
            ids, mask, token_type_ids = (
                batch[key].to(device, dtype=torch.long) for key in input_keys
            )
            batch_logits = model(ids, mask, token_type_ids)
            batch_outputs.append(batch_logits.detach().cpu().numpy())

    return np.concatenate(batch_outputs)

In [None]:
class SentimentData(Dataset):
    """Dataset that tokenizes the `comment_text` column of a DataFrame on access.

    Args:
        dataframe: Frame exposing a `comment_text` column.
        tokenizer: Object providing `encode_plus(...)` that returns
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: Length every encoded comment is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text.values
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the comment at `index`.

        Returns:
            Dict with int64 tensors under 'ids', 'mask' and 'token_type_ids'.
        """
        # str() guards against non-string cells; split/join collapses every
        # run of whitespace (newlines included) into a single space.
        text = " ".join(str(self.text[index]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True` flag.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }

In [None]:
class RobertaClass(torch.nn.Module):
    """Transformer encoder with mean pooling and a six-output linear head.

    The head is applied to five copies of the pooled representation, each
    passed through a different dropout layer (rates 0.1 to 0.5), and the five
    results are averaged.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained('../input/roberta-base')
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
            "add_pooling_layer": False,
            "num_labels": 6,
        }
        config.update(config_overrides)

        # Attribute names `transformer` and `output` determine the state-dict
        # keys that saved checkpoints are matched against.
        self.transformer = AutoModel.from_pretrained('../input/roberta-base', config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the averaged head outputs for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Segment ids forwarded to the encoder.

        Returns:
            Tensor produced by the six-unit linear head, averaged over the
            five dropout branches.
        """
        encoder_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Average the encoder's first output over dimension 1 (presumably the
        # token axis of a (batch, seq, hidden) tensor -- TODO confirm). The
        # attention mask is not applied here, so every position contributes.
        pooled = self.dropout(encoder_out[0].mean(dim=1))

        branches = (
            self.dropout1,
            self.dropout2,
            self.dropout3,
            self.dropout4,
            self.dropout5,
        )
        summed = None
        for branch_dropout in branches:
            branch_logits = self.output(branch_dropout(pooled))
            summed = branch_logits if summed is None else summed + branch_logits

        return summed / 5

In [None]:
def run_fold(df, model, fold, MODEL_PATH):
    """Load one fold's checkpoint into `model` and predict on `df`.

    Args:
        df: Frame with a `comment_text` column (consumed by SentimentData).
        model: Network whose weights are overwritten with the fold checkpoint.
        fold: Fold index used to build the checkpoint file name.
        MODEL_PATH: Directory containing `toxicx_model_{fold}.pth`.

    Returns:
        NumPy array of model outputs as returned by `inference_fn`, in the
        row order of `df` (the loader neither shuffles nor drops rows).
    """
    test_dataset = SentimentData(df, tokenizer, MAX_LEN)
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # NOTE(review): torch.load unpickles the file, so only point MODEL_PATH at
    # trusted checkpoints.
    state_dict = torch.load(f"{MODEL_PATH}/toxicx_model_{fold}.pth", map_location=device)
    # NOTE(review): strict=False ignores missing/unexpected keys silently; a
    # mismatched checkpoint would leave some layers at their initial weights.
    model.load_state_dict(state_dict, strict=False)
    model.to(device)

    return inference_fn(model, test_loader)

In [None]:
def run_k_fold(NFOLDS, df, model, MODEL_PATH):
    """Average the predictions of `NFOLDS` fold checkpoints.

    Args:
        NFOLDS: Number of folds; checkpoints 0 .. NFOLDS-1 are evaluated.
        df: Frame to predict on; its row count sets the output height.
        model: Network reused for every fold (weights are reloaded per fold).
        MODEL_PATH: Directory holding the fold checkpoints.

    Returns:
        Array of shape (len(df), 6) with the mean of the per-fold outputs.
    """
    n_rows = df.shape[0]
    averaged = np.zeros((n_rows, 6))

    for fold_idx in range(NFOLDS):
        fold_pred = run_fold(df, model, fold_idx, MODEL_PATH)
        # Each fold contributes an equal 1/NFOLDS share.
        averaged += fold_pred / NFOLDS

    return averaged

In [None]:
def TTA_Wrapper(df, tta, model, NFOLDS, MODEL_PATH):
    """Repeat the k-fold prediction `tta` times and average the rounds.

    Args:
        df: Frame to predict on; its row count sets the output height.
        tta: Number of repeated prediction rounds.
        model: Network passed through to `run_k_fold`.
        NFOLDS: Number of fold checkpoints averaged in each round.
        MODEL_PATH: Directory holding the fold checkpoints.

    Returns:
        Array of shape (len(df), 6) with the mean over all rounds.
    """
    averaged = np.zeros((df.shape[0], 6))

    for round_idx in range(tta):
        round_pred = run_k_fold(NFOLDS, df, model, MODEL_PATH)
        averaged += round_pred / tta
        # Progress marker, printed once per finished round.
        print(f'TTA {round_idx}')

    return averaged

In [None]:
# Directories holding the fold checkpoints (`toxicx_model_{fold}.pth`); only the
# first entry is used by the prediction cell below.
ENSEMBLE = ['../input/roberta-toxic-classification/', ]

In [None]:
if __name__ == '__main__':
    # Build the network once; each fold's weights are loaded into it in turn.
    model = RobertaClass()
    # Single pass (tta=1) over all CFG.n_fold checkpoints stored in the first
    # ensemble directory. The fold count comes from the config instead of a
    # hard-coded literal so the two cannot drift apart.
    predicted = TTA_Wrapper(test, 1, model, CFG.n_fold, ENSEMBLE[0])

In [None]:
# Write the six prediction columns into the submission frame, in this label order.
# NOTE(review): the values are the model's linear-head outputs averaged over
# folds; no sigmoid is applied anywhere in the visible code, so they are not
# bounded to [0, 1] -- confirm that the consumer of the file accepts raw scores.
sub[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']] = predicted

In [None]:
# Save the filled-in submission to the working directory, without the index column.
sub.to_csv('submission.csv', index=False)