In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import os

In [None]:
# Root folders of the Kaggle input datasets used by this notebook.
base_dir1 = '/kaggle/input/jigsaw-toxic-comment-classification-challenge'
base_dir2 = '/kaggle/input/jigsaw-toxic-severity-rating'
base_dir3 = '/kaggle/input/ruddit-jigsaw-dataset/'
df = pd.read_csv(os.path.join(base_dir1, 'train.csv'))

# Multiplier applied to each label column before the labels are summed
# into a single score.
cat_mtpl = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

# Scale every label column by its multiplier.
for category, multiplier in cat_mtpl.items():
    df[category] = df[category] * multiplier

# Row-wise sum of the weighted labels, then divide by the largest sum.
weighted_labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
df['y'] = weighted_labels.sum(axis=1).astype(float)
df['y'] = df['y'] / df['y'].max()

# Keep only the comment text (renamed to `text`) and the target `y`.
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(10)

In [None]:
n_folds = 5


# Sample the training data.
def sample_train(df, fold, frac_1=0.5, frac_1_factor=1.2):
    """Build a shuffled training subset from scored and unscored rows.

    Args:
        df: DataFrame with a numeric ``y`` column.
        fold: Integer seed used for sampling the ``y > 0`` rows and for the
            final shuffle.
        frac_1: Fraction of the ``y > 0`` rows to keep.
        frac_1_factor: Number of ``y == 0`` rows to draw, expressed as a
            multiple of ``len(positives) * frac_1``.

    Returns:
        A new DataFrame holding the sampled rows in shuffled order. The
        original index labels are preserved.
    """
    positives = df[df.y > 0]
    negatives = df[df.y == 0]

    # Never request more negatives than exist; DataFrame.sample would raise
    # a ValueError otherwise (sampling is without replacement).
    n_negatives = min(int(len(positives) * frac_1 * frac_1_factor), len(negatives))

    # NOTE(review): negatives are always drawn with random_state=0, so every
    # fold receives the same y == 0 rows — confirm this is intended.
    sampled = pd.concat(
        [
            positives.sample(frac=frac_1, random_state=fold),
            negatives.sample(n=n_negatives, random_state=0),
        ],
        axis=0,
    )
    return sampled.sample(frac=1, random_state=fold)

In [None]:
# Write one resampled training CSV per fold into the working directory.
for fld in range(n_folds):
    fold_filename = f'df_fld{fld}.csv'
    df_sub = sample_train(df, fld)
    df_sub.to_csv(fold_filename, index=False)
    print(fold_filename, "created")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import transformers
from tqdm import tqdm

In [None]:
# Token length every comment is truncated/padded to (passed to the dataset as max_len).
MAX_LEN = 128
# Batch sizes for the training loader and for the validation/submission loaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
# Adam optimizer settings: learning rate, weight decay and epsilon.
LEARNING_RATE = 1e-5
WEIGHT_DECAY=1e-6
EPS=1e-6
# Dropout probability used by the regression model.
DROPOUT=0.1
# T_max passed to the cosine-annealing learning-rate scheduler.
TMAX=10
# First CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Tokenizer loaded from the local roberta-base input directory.
tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

In [None]:
class ToxicData(Dataset):
    """Dataset that tokenizes the ``text`` column and pairs it with ``y``.

    Rows are addressed by position, so the dataframe does not need a
    0..n-1 index (for example it may come straight from ``.sample()``).

    Args:
        dataframe: DataFrame with a ``text`` column and a numeric ``y`` column.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.y
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tokenized row at position ``index``.

        Returns:
            dict with ``ids`` and ``mask`` (long tensors) and ``targets``
            (float tensor).
        """
        # .iloc keeps the lookup positional; a plain [] lookup is label-based
        # and raises KeyError when the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Keyword arguments for the training DataLoader; batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Keyword arguments for the validation DataLoader. Shuffling is disabled:
# the validation loss is a mean of per-batch means, so a shuffled loader
# makes the value used for early stopping vary between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

# Keyword arguments for the submission DataLoader; order must be preserved
# so predictions line up with the input rows.
sub_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained encoder followed by a small head that outputs one value per input.

    Args:
        model_path: Name or directory passed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability for the head; ``None`` uses the
            notebook-level ``DROPOUT`` constant.
    """

    def __init__(self, model_path='../input/roberta-base', dropout=None):
        super().__init__()
        if dropout is None:
            dropout = DROPOUT
        self.l1 = AutoModel.from_pretrained(model_path)
        # Take the width from the loaded encoder instead of hard-coding 768,
        # so a checkpoint with a different hidden size still works.
        hidden_size = self.l1.config.hidden_size
        self.layer_norm = torch.nn.LayerNorm(hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 256),
            torch.nn.LeakyReLU(negative_slope=0.01),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(256, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return the head's output for each sequence in the batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Tensor produced by the final ``Linear(256, 1)`` layer.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output; position 0 along dim 1 is used
        # as the per-sequence representation.
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooled_output = self.layer_norm(pooler)
        pooled_output = self.dropout(pooled_output)
        preds = self.dense(pooled_output)
        return preds

In [None]:
# Mean-squared-error loss shared by the training and validation loops.
loss_function = torch.nn.MSELoss()

In [None]:
def train(model,epoch,training_loader,optimizer):
    """Run one training epoch and print the mean per-batch loss.

    Args:
        model: Module called as ``model(ids, mask)``.
        epoch: Epoch number, used only in the printed message.
        training_loader: Iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The mean of the per-batch losses for this epoch (not sample-weighted).

    Raises:
        ValueError: If ``training_loader`` yields no batches.
    """
    tr_loss = 0
    nb_tr_steps = 0
    model.train()
    # Iterate the loader directly so tqdm can show the total batch count.
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask)

        # Add a trailing dimension so targets line up with the model output.
        targets = targets[:,None]
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        nb_tr_steps += 1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0:
        raise ValueError("training_loader produced no batches")

    epoch_loss = tr_loss/nb_tr_steps
    print(f"Training Loss Epoch {epoch}: {epoch_loss}")

    return epoch_loss

In [None]:
def valid(model, epoch, testing_loader):
    """Evaluate the model on a loader and print the mean per-batch loss.

    Args:
        model: Module called as ``model(ids, mask)``.
        epoch: Epoch number, used only in the printed message.
        testing_loader: Iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        The mean of the per-batch losses (not sample-weighted).

    Raises:
        ValueError: If ``testing_loader`` yields no batches.
    """
    model.eval()
    tr_loss = 0
    nb_tr_steps = 0
    with torch.no_grad():
        # Iterate the loader directly so tqdm can show the total batch count.
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            # Add a trailing dimension so targets line up with the model output.
            targets = targets[:,None]
            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            nb_tr_steps += 1

    if nb_tr_steps == 0:
        raise ValueError("testing_loader produced no batches")

    epoch_loss = tr_loss/nb_tr_steps
    print(f"Validation Loss Epoch{epoch}: {epoch_loss}")

    return epoch_loss

In [None]:
def predict(model, testing_loader):
    """Run the model over a loader and return all outputs concatenated.

    Args:
        model: Module called as ``model(ids, mask)``.
        testing_loader: Iterable of batches with ``ids`` and ``mask`` tensors.

    Returns:
        Tensor of the per-batch outputs concatenated along dim 0, or ``None``
        when the loader yields no batches.
    """
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        # Iterate the loader directly so tqdm can show the total batch count.
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            batch_outputs.append(model(ids, mask))

    if not batch_outputs:
        return None
    # One concatenation at the end instead of re-copying the growing tensor
    # on every batch.
    return torch.cat(batch_outputs, dim=0)

In [None]:
# Comments to be scored; the id column is dropped after loading.
df_test = pd.read_csv(os.path.join(base_dir2, "comments_to_score.csv")).drop(columns=['comment_id'])
# Placeholder target so the frame carries the `y` column the dataset reads.
df_test['y'] = 0
# One column of predictions per fold, one row per comment.
test_preds_bert = np.zeros((len(df_test), n_folds))

In [None]:
# Train one model per fold file, early-stop on the validation loss, and store
# that fold's predictions for the submission comments.
for fld in range(n_folds):
    df_all = pd.read_csv(f"df_fld{fld}.csv")
    # 90/10 train/validation split with a fixed seed.
    train_size = 0.9
    train_data=df_all.sample(frac=train_size,random_state=200)
    test_data=df_all.drop(train_data.index).reset_index(drop=True)
    train_data = train_data.reset_index(drop=True)

    print("FULL Dataset: {}".format(df_all.shape))
    print("TRAIN Dataset: {}".format(train_data.shape))
    print("TEST Dataset: {}".format(test_data.shape))
    print("SUBMISSION dataset: {}".format(df_test.shape))

    training_set = ToxicData(train_data, tokenizer, MAX_LEN)
    testing_set = ToxicData(test_data, tokenizer, MAX_LEN)
    submission_set = ToxicData(df_test,tokenizer,MAX_LEN)

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)
    submission_loader = DataLoader(submission_set,**sub_params)

    model = RobertaClass()
    model.to(device)

    optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE,weight_decay=WEIGHT_DECAY,eps=EPS)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TMAX)
    EPOCHS = 10
    best_score = float('inf')
    patience=1
    cur_p=0
    # CPU copy of the weights from the epoch with the lowest validation loss.
    best_state = None
    for epoch in range(EPOCHS):
        train(model,epoch,training_loader,optimizer)
        val_loss=valid(model,epoch,testing_loader)
        if val_loss < best_score:
            best_score=val_loss
            cur_p=0
            # Snapshot now: training may continue for up to patience + 1
            # non-improving epochs before the loop stops.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
        else:
            cur_p+=1
        # Stop once the validation loss has failed to improve for more than
        # `patience` consecutive epochs.
        if cur_p > patience:
            break
        scheduler.step()

    # Predict with the best validation weights rather than the last-epoch ones.
    if best_state is not None:
        model.load_state_dict(best_state)
    preds=predict(model,submission_loader)
    preds=preds.cpu().detach().numpy()
    test_preds_bert[:,fld] = preds[:,0]

    # Drop every reference to this fold's parameters (the optimizer holds
    # them too) so the memory can be reused by the next fold's model.
    del model, optimizer, scheduler, best_state
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
# Display the per-fold predictions (one row per comment, one column per fold).
test_preds_bert

In [None]:
# Fill the sample submission's score column with the mean prediction across
# folds and write it to the working directory.
submission = pd.read_csv(os.path.join(base_dir2, "sample_submission.csv"))
submission['score'] = test_preds_bert.mean(axis=1)
submission.to_csv("submission.csv", index=False)