In [None]:
class CFG:
    """Single place for every tunable used by the notebook."""

    # --- run control -------------------------------------------------------
    debug = False            # when True the data frames are subsampled
    seed = 42                # random_state for sampling and the KFold split
    n_fold = 4               # number of cross-validation folds
    epochs = 4               # training epochs per fold
    print_freq = 50          # print progress every `print_freq` steps

    # --- data ----------------------------------------------------------------
    text = "text"            # name of the input-text column
    target = "target"        # name of the regression-target column
    max_len = 256            # passed to the tokenizer as `max_length`
    batch_size = 32          # DataLoader batch size
    num_workers = 4          # DataLoader worker processes

    # --- model ---------------------------------------------------------------
    model_name = "../input/roberta-base-edited"  # path given to from_pretrained
    hidden_size = 768        # input width of the linear head
    target_size = 1          # output width of the linear head
    fc_dropout = 0.          # dropout probability applied before the head

    # --- optimisation --------------------------------------------------------
    lr = 1e-5                # AdamW learning rate
    weight_decay = 0.        # weight decay for the decayed parameter group
    no_decay = True          # True: bias/LayerNorm go to a zero-decay group
    n_accumulate = 1         # optimizer steps once every `n_accumulate` batches
    scheduler = "cosine"     # "linear" or "cosine" (see get_scheduler)
    num_cycles = 1           # forwarded to the cosine schedule
    num_warmup_steps = 80    # warm-up steps forwarded to the schedule

In [None]:
import os
import gc
import re
import gc
import time
import math
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR
# NLP
from transformers import AutoTokenizer, AutoModel,get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

from bs4 import BeautifulSoup

# Use the fully-qualified option key: on pandas >= 1.4 the bare "max_columns"
# pattern also matches "styler.render.max_columns" and raises OptionError.
pd.set_option("display.max_columns",100)

In [None]:
# Random Seed Initialize
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator the notebook relies on.

    Sets PYTHONHASHSEED and seeds Python's `random`, NumPy and PyTorch
    (CPU and all CUDA devices), then puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between
    # runs, which defeats the deterministic flag set just above.
    torch.backends.cudnn.benchmark = False
seed_everything()

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

## Data Loading

In [None]:
# Pairwise validation set of the severity-rating competition.
validation_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Train split of the toxic-comment classification competition.
train_1st_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')

# Its test split ships text and labels in separate files; join them on `id`.
train_1st_test_lb_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
train_1st_test_df = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/test.csv'
).merge(train_1st_test_lb_df, on='id')

# Discard test rows where any label column (from "toxic" onwards) equals -1
# (presumably rows that were never labelled -- TODO confirm against the data card).
drop_idx = (train_1st_test_df.loc[:, "toxic":] == -1).any(axis=1)
train_1st_test_df = train_1st_test_df[~drop_idx].reset_index(drop=True)

# Stack the usable test rows under the train rows.
train_1st_df = pd.concat([train_1st_df, train_1st_test_df]).reset_index(drop=True)

if CFG.debug:
    train_1st_df = train_1st_df.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)

In [None]:
OUTPUT_DIR = './'
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating, and a non-directory at this path fails loudly.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def get_logger(filename=OUTPUT_DIR+'train'):
    """Build the notebook logger that writes to the console and to a file.

    Args:
        filename: Log file path without extension; ".log" is appended.

    Returns:
        The logger named after this module, set to INFO, with exactly one
        stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would otherwise stack handlers and print each message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

In [None]:
# https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train/notebook
def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated ASCII letters and digits.

    Steps, in order:
    1. Drop URLs (http://, https:// and www. prefixes).
    2. Strip HTML markup, keeping only the text content.
    3. Drop characters in the emoji code-point ranges listed below.
    4. Replace every remaining character that is not a letter or digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.
    '''
    # 1. URLs
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()

    # 3. Emojis
    emoji_chars = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+"
    )
    text = re.sub(emoji_chars, r'', text, flags=re.UNICODE)

    # 4. Special characters -> space
    text = re.sub(r"[^a-zA-Z\d]", " ", text)

    # 5. Squeeze repeated spaces, then trim the ends
    return re.sub(' +', ' ', text).strip()

In [None]:
def create_fold(input_df):
    """Return a copy of `input_df` with an integer `fold` column.

    Rows are split into `CFG.n_fold` shuffled folds (seeded with `CFG.seed`);
    each row's `fold` value is the index of the fold in which it is held out.
    The per-fold row counts are displayed as a side effect.

    Args:
        input_df: DataFrame to split. Its index may be arbitrary.

    Returns:
        A new DataFrame; `input_df` itself is not modified.
    """
    splitter = KFold(n_splits=CFG.n_fold, random_state=CFG.seed, shuffle=True)
    outdf = input_df.copy()
    # KFold.split yields *positional* indices. Fill a plain array by position
    # rather than using label-based .loc, which only works by coincidence when
    # the frame has a default 0..n-1 index.
    fold_ids = np.full(len(outdf), -1, dtype=int)
    for n, (_, val_index) in enumerate(splitter.split(outdf)):
        fold_ids[val_index] = n
    outdf['fold'] = fold_ids
    display(outdf.groupby('fold').size())
    return outdf

In [None]:
class JigsawDataset(Dataset):
    """Tokenises one text per item and, in train mode, pairs it with its label.

    Items are `(input_ids, attention_mask, label)` when `is_train` is True and
    `(input_ids, attention_mask)` otherwise. Every text is truncated or padded
    to `CFG.max_len` tokens.
    """

    def __init__(self, CFG, input_df, is_train=True):
        self.CFG = CFG
        self.is_train = is_train
        # Columns are looked up by the names stored in the config.
        self.text = input_df[self.CFG.text].values
        self.tokenizer = AutoTokenizer.from_pretrained(self.CFG.model_name)
        if self.is_train:
            self.labels = input_df[self.CFG.target].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        enc = self.tokenizer.encode_plus(
            self.text[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.CFG.max_len,
        )
        sample = (
            torch.tensor(enc['input_ids']),
            torch.tensor(enc['attention_mask']),
        )
        if not self.is_train:
            return sample
        # Train mode: append the target as a third element.
        return sample + (torch.tensor(self.labels[idx]),)

In [None]:
class JigsawModel(nn.Module):
    """Pretrained backbone followed by dropout and a single linear head."""

    def __init__(self, CFG):
        super().__init__()
        self.CFG = CFG
        self.model = AutoModel.from_pretrained(self.CFG.model_name)
        self.fc_dropout = nn.Dropout(self.CFG.fc_dropout)
        self.fc = nn.Linear(self.CFG.hidden_size, self.CFG.target_size)

    def forward(self, input_ids, attention_mask):
        backbone_out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the backbone output feeds the head
        # (presumably the pooled sentence representation -- TODO confirm for this checkpoint).
        pooled = backbone_out[1]
        return self.fc(self.fc_dropout(pooled))

In [None]:
class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
# utils
def asMinutes(s):
    """Format a duration given in seconds as "<minutes>m <seconds>s"."""
    whole_minutes = math.floor(s / 60)
    return "%dm %ds" % (whole_minutes, s - whole_minutes * 60)

def timeSince(since, percent):
    """Report elapsed time since `since` and the projected time remaining.

    `percent` is the fraction of work already done (0 < percent <= 1); the
    remaining time is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def get_scheduler(CFG, optimizer, num_train_steps):
    """Create the warm-up learning-rate schedule selected by `CFG.scheduler`.

    Args:
        CFG: Config exposing `scheduler` ("linear" or "cosine"),
            `num_warmup_steps` and, for the cosine schedule, `num_cycles`.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps over the whole run.

    Returns:
        The scheduler built by the matching transformers helper.

    Raises:
        ValueError: If `CFG.scheduler` is neither "linear" nor "cosine".
    """
    # https://huggingface.co/docs/transformers/main_classes/optimizer_schedules#transformers.get_cosine_schedule_with_warmup
    if CFG.scheduler=='linear':
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=CFG.num_warmup_steps,
            num_training_steps=num_train_steps
        )
    if CFG.scheduler=='cosine':
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=CFG.num_warmup_steps,
            num_training_steps=num_train_steps,
            num_cycles=CFG.num_cycles
        )
    # Previously an unknown name fell through to `return scheduler` and died
    # with an UnboundLocalError that did not mention the offending value.
    raise ValueError(
        f"Unsupported CFG.scheduler {CFG.scheduler!r}; expected 'linear' or 'cosine'"
    )

In [None]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        train_loader: Iterable of `(input_ids, attention_mask, labels)` batches.
        model: Model called as `model(input_ids, attention_mask)`.
        criterion: Loss called as `criterion(predictions, labels)`.
        optimizer: Optimizer stepped once every `CFG.n_accumulate` batches.
        epoch: Zero-based epoch index, used only in the progress line.
        scheduler: LR scheduler stepped after each optimizer step, or None.
        device: Device the batch tensors are moved to.

    Returns:
        Average of the per-batch losses, weighted by batch size.
    """
    start = time.time()
    losses = AverageMeter()

    model.train()

    for step, (input_ids, attention_mask, labels) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # Labels become a float column vector to match the model output shape.
        labels = labels.to(device).reshape(-1, 1).float()
        batch_size = labels.size(0)
        y_preds = model(input_ids, attention_mask)

        loss = criterion(y_preds, labels)

        # Log the unscaled loss so the reported value is comparable across
        # different accumulation settings.
        losses.update(loss.item(), batch_size)
        # Gradients add up over `n_accumulate` backward passes; dividing keeps
        # the accumulated gradient a mean instead of a sum (no-op when it is 1).
        (loss / CFG.n_accumulate).backward()

        if (step + 1) % CFG.n_accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            # get_last_lr() is the supported accessor (get_lr() warns when
            # called outside step()); fall back to the optimizer when no
            # scheduler was supplied instead of crashing on None.
            if scheduler is not None:
                current_lr = scheduler.get_last_lr()[0]
            else:
                current_lr = optimizer.param_groups[0]["lr"]
            print(
                f"Epoch:[{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed:{timeSince(start, float(step + 1) / len(train_loader))} "
                f"Loss:{losses.avg:.4f} "
                f"LR:{current_lr:.8f}"
            )
    return losses.avg

In [None]:
def valid_fn(valid_loader, model, criterion, epoch, device):
    """Evaluate the model on a labelled loader without updating weights.

    Returns a pair: the sample-weighted mean loss, and the model outputs for
    all batches concatenated in loader order as one NumPy array.
    """
    started_at = time.time()
    meter = AverageMeter()
    batch_outputs = []
    n_batches = len(valid_loader)

    model.eval()

    for step, batch in enumerate(valid_loader):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device).reshape(-1, 1).float()

        # Neither the forward pass nor the loss needs gradients here.
        with torch.no_grad():
            y_preds = model(input_ids, attention_mask)
            loss = criterion(y_preds, labels)

        meter.update(loss.item(), labels.size(0))
        batch_outputs.append(y_preds.to("cpu").numpy())

        is_last_step = step == (n_batches - 1)
        if is_last_step or step % CFG.print_freq == 0:
            print(
                f"Epoch:[{epoch + 1}][{step}/{n_batches}] "
                f"Elapsed:{timeSince(started_at, float(step + 1) / n_batches)} "
                f"Loss:{meter.avg:.4f} "
            )

    return meter.avg, np.concatenate(batch_outputs)

In [None]:
def inference_fn(test_loader, model, device):
    """Predict on an unlabelled loader and return all outputs as one array.

    The model is switched to eval mode and moved to `device`; batches are
    `(input_ids, attention_mask)` pairs and outputs keep loader order.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
            y_preds = model(input_ids.to(device), attention_mask.to(device))
            batch_outputs.append(y_preds.to("cpu").numpy())
    return np.concatenate(batch_outputs)

In [None]:
def train_loop(train_df, fold):
    """Train one cross-validation fold and return its held-out rows with predictions.

    Args:
        train_df: DataFrame with a `fold` column plus the columns named by
            `CFG.text` and `CFG.target`.
        fold: Fold id used for validation; every other row is used for training.

    Returns:
        The held-out rows (index reset) with an added `preds` column holding
        the predictions from the epoch with the lowest validation loss.

    Raises:
        RuntimeError: If no epoch produced a validation loss that could be
            ranked (e.g. the loss was NaN every time), so no best model exists.

    Side effects:
        Writes `<model basename>_fold<fold>_best.pth` under `OUTPUT_DIR`
        whenever the validation loss improves or ties the best so far.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")
    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train_df[train_df["fold"] != fold].index
    val_idx = train_df[train_df["fold"] == fold].index

    train_folds = train_df.loc[trn_idx].reset_index(drop=True)
    valid_folds = train_df.loc[val_idx].reset_index(drop=True)

    # The validation set is built with is_train=True because it needs labels.
    train_dataset = JigsawDataset(CFG, train_folds, is_train=True)
    valid_dataset = JigsawDataset(CFG, valid_folds, is_train=True)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = JigsawModel(CFG)
    model = model.to(device)

    if CFG.no_decay:
        # Biases and LayerNorm parameters go to a group without weight decay.
        no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': CFG.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=CFG.lr)
    else:
        # Pass the configured decay explicitly; otherwise AdamW silently uses
        # its own default and CFG.weight_decay has no effect on this branch.
        optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

    # The scheduler is stepped once per optimizer step. Count those exactly:
    # drop_last discards the partial batch and the optimizer only steps every
    # `n_accumulate` batches, so len(train_folds) / batch_size overestimates.
    num_train_steps = (len(train_loader) // CFG.n_accumulate) * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    criterion = nn.MSELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_loss = np.inf
    best_preds = None
    best_path = os.path.join(OUTPUT_DIR, f"{CFG.model_name.split('/')[-1]}_fold{fold}_best.pth")

    for epoch in range(CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
        # eval
        print("eval start")
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, epoch, device)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")

        if avg_val_loss<= best_loss:
            best_loss = avg_val_loss
            # Keep the winning predictions in memory rather than re-reading
            # them from the checkpoint afterwards.
            best_preds = preds

            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_loss:.4f} Model")
            torch.save(
                {"model": model.state_dict(),
                 "preds": preds
                },
                best_path
            )

    if best_preds is None:
        # A NaN loss never satisfies `<=`, so nothing was saved this run.
        # Reloading from disk here could silently return predictions from a
        # stale checkpoint left by an earlier run.
        raise RuntimeError(
            f"fold {fold}: no epoch produced a comparable validation loss; "
            f"no best checkpoint was written to {best_path}"
        )
    valid_folds["preds"] = best_preds

    del model,train_loader,valid_loader
    gc.collect()
    torch.cuda.empty_cache()

    return valid_folds

In [None]:
def main():
    """Train every fold and return the stacked out-of-fold predictions.

    Returns:
        DataFrame made of the held-out rows of each fold, in fold order, each
        carrying its `preds` column. Empty when `CFG.n_fold` is 0.
    """
    # Training
    fold_frames = []
    for fold in range(CFG.n_fold):
        fold_frames.append(train_loop(train_df, fold))
        LOGGER.info(f"========== fold: {fold} result ==========")
    # CV result
    LOGGER.info("========== CV ==========")
    # Concatenate once at the end: growing a frame with pd.concat inside the
    # loop re-copies earlier folds each time, and seeding it with an empty
    # DataFrame relies on deprecated pandas dtype handling.
    return pd.concat(fold_frames) if fold_frames else pd.DataFrame()

In [None]:
# Pool both sides of every validation pair, then plot the overlap between
# those texts and the training comments.
txt_validation_df = pd.concat(
    [validation_df[side] for side in ("less_toxic", "more_toxic")]
)
txt_1st = train_1st_df["comment_text"]

venn2(
    subsets=(set(txt_validation_df), set(txt_1st)),
    set_labels=("validation_df", "1st"),
)

In [None]:
# Remove every training comment whose text also appears in the validation
# pairs, reporting the row count before and after.
print(f"Before:{len(train_1st_df)}")

val_text_list = list(set(txt_validation_df))
dup_idx = train_1st_df["comment_text"].isin(val_text_list)
print(f"Num_duplicate_text_{dup_idx.sum()}")

train_1st_df = train_1st_df.loc[~dup_idx].reset_index(drop=True)
print(f"After:{len(train_1st_df)}")

In [None]:
# Rebuild both text collections from the current frames and plot their
# overlap again.
txt_validation_df = pd.concat(
    [validation_df[side] for side in ("less_toxic", "more_toxic")]
)
txt_1st = train_1st_df["comment_text"]
venn2(
    subsets=(set(txt_validation_df), set(txt_1st)),
    set_labels=("validation_df", "1st"),
)

In [None]:
# Drop the temporary text collections and ask the garbage collector to
# reclaim their memory.
del txt_validation_df,txt_1st, val_text_list
gc.collect()

In [None]:
# Per-label weights applied to the label columns.
cat_mtpl = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

# NOTE: the columns are overwritten in place, so running this cell twice
# applies the weights twice.
for category, weight in cat_mtpl.items():
    train_1st_df[category] = train_1st_df[category] * weight

In [None]:
# Regression target: row-wise sum of the columns from "toxic" through
# "identity_hate" (selected by position in the frame, not by name list).
train_1st_df["target"] = train_1st_df.loc[:, "toxic":"identity_hate"].sum(axis=1)

In [None]:
# Inspect the target: histogram plus summary statistics.
train_1st_df["target"].hist(bins = 10)
train_1st_df["target"].describe()

In [None]:
# Fraction of rows whose target is exactly 0.
len(train_1st_df[train_1st_df["target"]==0])/len(train_1st_df)

In [None]:
# Keep only a fraction of the zero-target rows and all non-zero rows,
# restoring the original row order before renumbering the index.
frac = 0.05
is_zero_target = train_1st_df["target"] == 0
zero_sample_df = train_1st_df[is_zero_target].sample(frac=frac, random_state=1)
non_zero_df = train_1st_df[~is_zero_target]
train_df = (
    pd.concat([zero_sample_df, non_zero_df])
    .sort_index()
    .reset_index(drop=True)
)

# sampling_result
n_before, n_after = len(train_1st_df), len(train_df)
print(f"before_len:{n_before}")
print(f"after_len:{n_after}")
print(f"Ratio :{n_after/n_before}")

del train_1st_df, is_zero_target
gc.collect()

In [None]:
# Inspect the target of the down-sampled frame: histogram plus summary statistics.
train_df["target"].hist(bins = 40)
train_df["target"].describe()

In [None]:
# Rename the text column, clean every comment, then assign fold ids.
train_df = create_fold(
    train_df
    .rename(columns={"comment_text": "text"})
    .assign(text=lambda frame: frame["text"].apply(text_cleaning))
)

In [None]:
# Train all folds and keep the out-of-fold predictions.
if __name__ == "__main__":
    oof_df = main()

In [None]:
# Debug mode: score only a fixed 100-row sample of the validation pairs.
if CFG.debug:
    validation_df = validation_df.sample(n=100, random_state=CFG.seed).reset_index(drop=True)

In [None]:
# Preview the validation pairs.
validation_df.head()

In [None]:
# Give every distinct validation text one integer id so that each text is
# scored once, however many pairs it appears in.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
whole_unique_sentence = pd.concat(
    [validation_df["less_toxic"], validation_df["more_toxic"]]
).unique()
sentence_master_dict = {sentence: idx for idx, sentence in enumerate(whole_unique_sentence)}
validation_df["less_toxic_id"] = validation_df["less_toxic"].map(sentence_master_dict)
validation_df["more_toxic_id"] = validation_df["more_toxic"].map(sentence_master_dict)
del whole_unique_sentence
gc.collect()

In [None]:
# One row per distinct validation text: its id and the raw text.
val_df_for_pred = pd.DataFrame({
    "id": list(sentence_master_dict.values()),
    "text": list(sentence_master_dict.keys()),
})
del sentence_master_dict
gc.collect()

In [None]:
# Apply the same cleaning to the validation texts as to the training texts.
val_df_for_pred["text"] = val_df_for_pred["text"].apply(text_cleaning)

In [None]:
# Unlabelled dataset/loader over the distinct validation texts; order is
# preserved (no shuffling, no dropped batch) so outputs align with the rows.
val_dataset = JigsawDataset(CFG, val_df_for_pred, is_train=False)
val_loader = DataLoader(
    val_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

In [None]:
# Score the validation texts with the best checkpoint of every fold.
predictions = []
for fold in range(CFG.n_fold):
    checkpoint_path = "./"+f"{CFG.model_name.split('/')[-1]}_fold{fold}_best.pth"
    # Load onto the CPU first; inference_fn moves the model to `device`.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model = JigsawModel(CFG)
    model.load_state_dict(state['model'])
    prediction = inference_fn(val_loader, model, device)
    predictions.append(prediction)
    # Release this fold's model before loading the next one.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Average the per-fold outputs element-wise to get one score per text.
val_df_for_pred["pred"] = np.mean(predictions, axis=0)

In [None]:
# Lookup table: text id -> averaged prediction.
scoring_dict = val_df_for_pred.set_index("id")["pred"].to_dict()

In [None]:
# Attach the score of each side of every pair by looking up its text id.
validation_df["less_toxic_pred"] = validation_df["less_toxic_id"].map(scoring_dict)
validation_df["more_toxic_pred"] = validation_df["more_toxic_id"].map(scoring_dict)

In [None]:
# A pair counts as correct (1) when the "less toxic" text scores strictly
# lower than the "more toxic" one; ties count as incorrect (0).
validation_df["correct"] = (validation_df["less_toxic_pred"]  < validation_df["more_toxic_pred"]).astype(int)

In [None]:
# Share of validation pairs ranked correctly.
validation_df["correct"].mean()