In [None]:
#! pip install transformers

In [None]:
import transformers
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
class CONFIG :
    """
    Central place for the hyper-parameters, file locations and the tokenizer
    shared by the dataset, model, training and inference code in this notebook.
    """
    # maximum token length handed to the tokenizer as max_length
    MAX_LEN = 512

    # batch sizes for the training, validation and test data loaders
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 4
    TEST_BATCH_SIZE = 4

    # number of passes over the training data made by train()
    EPOCHS = 1

    # directory the pretrained BERT tokenizer and model weights are loaded from
    BERT_PATH = "../input/bertbaseuncased/"

    # file that train() writes the model state_dict to
    MODEL_PATH = "model3.bin"

    # training file, validation file, test file
    # NOTE(review): the first two paths start with "input/" while BERT_PATH and
    # TEST_FILE start with "../input/" - confirm both prefixes are intended.

    # read by train(); must provide the columns kfold, cleaned_text and y
    TRAINING_FILE = "input/train_folds.csv"
    
    # read by get_predictions(); must provide the columns more_toxic and less_toxic
    VALIDATION_FILE = "input/validation_data_cleaned.csv"
    
    # read by get_predictions_test(); must provide the column text
    TEST_FILE = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"

    # define the tokenizer
    # this call runs once, when the class body is executed, so BERT_PATH must
    # exist at that point

    TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case = True)


In [None]:
class BERTDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that tokenizes one comment per access and returns the
    tensors the BERT model in this notebook expects.
    """

    def __init__(self, comment, target):
        """
        :param comment: list, numpy array or pandas Series of strings
        :param target: list, numpy array or pandas Series of labels, aligned
                       element by element with ``comment``
        """
        self.comment = self._positional(comment)
        self.target = self._positional(target)
        self.tokenizer = CONFIG.TOKENIZER
        self.max_len = CONFIG.MAX_LEN

    @staticmethod
    def _positional(values):
        """Return ``values`` in a form where ``values[i]`` is the i-th element."""
        # A pandas Series indexes by label, so a Series whose index is not
        # 0..n-1 would raise KeyError or return the wrong row for the integer
        # positions a DataLoader asks for. Its numpy view is positional.
        return values.to_numpy() if hasattr(values, "to_numpy") else values

    def __len__(self):
        # number of comments in the dataset
        return len(self.comment)

    def __getitem__(self, item):
        """
        Tokenize the comment at position ``item``.

        :param item: integer position of the sample
        :return: dict with the keys "ids", "mask", "token_type_ids" (long
                 tensors built from the tokenizer output) and "targets"
                 (float tensor holding the label)
        """
        # collapse every run of whitespace into a single space
        comment = " ".join(str(self.comment[item]).split())

        # padding="max_length" replaces the deprecated pad_to_max_length=True;
        # truncation=True makes the cut to max_length explicit so every sample
        # has exactly max_len tokens and batches can be stacked.
        inputs = self.tokenizer.encode_plus(
            comment,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        # ids of the generated tokens
        ids = inputs["input_ids"]
        # mask is 1 where we have input and 0 where we have padding
        mask = inputs["attention_mask"]
        # segment ids: 0 for the first sentence and 1 for the second one;
        # only a single text is passed here (the text pair is None)
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }
        

In [None]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a linear layer producing one logit."""

    def __init__(self):
        super().__init__()

        # pretrained encoder, loaded from the directory named in CONFIG.BERT_PATH
        self.bert = transformers.BertModel.from_pretrained(CONFIG.BERT_PATH)
        # dropout for regularization
        self.bert_drop = nn.Dropout(p=0.3)
        # linear head: 768 input features -> 1 output
        self.out = nn.Linear(in_features=768, out_features=1)

    def forward(self, ids, mask, token_type_ids):
        """
        Run the encoder and the head.

        :param ids: token ids
        :param mask: attention mask
        :param token_type_ids: segment ids
        :return: output of the linear head (raw logits, no sigmoid applied)
        """
        # with return_dict=False the encoder result is unpacked as a 2-tuple;
        # the second element (output of the BERT pooler layer) feeds the head
        _sequence_output, pooled_output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )

        regularized = self.bert_drop(pooled_output)
        return self.out(regularized)

In [None]:
def loss_fn(outputs, targets):
    """
    Mean binary cross-entropy computed on raw logits.

    :param outputs: model logits, one column per sample
    :param targets: labels; reshaped to a single column before the comparison
    :return: scalar loss tensor
    """
    column_targets = targets.view(-1, 1)
    return nn.functional.binary_cross_entropy_with_logits(outputs, column_targets)

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    """
    Train the model for one pass over ``data_loader``.

    :param data_loader: iterable of batches; each batch is a dict with the
                        keys "ids", "token_type_ids", "mask" and "targets"
    :param model: module called with the keywords ids, mask, token_type_ids
    :param optimizer: optimizer stepped once per batch
    :param device: device every batch tensor is moved to
    :param scheduler: learning-rate scheduler stepped once per batch
    """
    # training mode (enables dropout)
    model.train()

    for batch in data_loader:
        input_ids = batch["ids"].to(device, dtype=torch.long)
        segment_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        attention_mask = batch["mask"].to(device, dtype=torch.long)
        labels = batch["targets"].to(device, dtype=torch.float)

        # clear the gradients left over from the previous batch
        optimizer.zero_grad()

        logits = model(ids=input_ids, mask=attention_mask, token_type_ids=segment_ids)
        batch_loss = loss_fn(logits, labels)

        # backpropagate, then update the weights and the learning rate
        batch_loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(data_loader, model, device):
    """
    Run the model over ``data_loader`` without gradients and collect results.

    :param data_loader: iterable of batches; each batch is a dict with the
                        keys "ids", "token_type_ids", "mask" and "targets"
    :param model: module called with the keywords ids, mask, token_type_ids
    :param device: device every batch tensor is moved to
    :return: (outputs, targets) - outputs holds the sigmoid of the model
             output as nested lists, targets holds the batch targets as lists
    """
    # evaluation mode (disables dropout)
    model.eval()
    fin_outputs, fin_targets = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["ids"].to(device, dtype=torch.long)
            segment_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            attention_mask = batch["mask"].to(device, dtype=torch.long)
            labels = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids=input_ids, mask=attention_mask, token_type_ids=segment_ids)

            fin_targets.extend(labels.detach().cpu().numpy().tolist())

            # logits are moved to the CPU first, then squashed into (0, 1)
            probabilities = torch.sigmoid(logits.detach().cpu())
            fin_outputs.extend(probabilities.numpy().tolist())

    return fin_outputs, fin_targets

In [None]:
def train():
    """
    Fine-tune the BERT classifier on every fold except fold 3 and validate
    on fold 3.

    Reads CONFIG.TRAINING_FILE (columns kfold, cleaned_text, y), trains for
    CONFIG.EPOCHS epochs, prints the validation accuracy after each epoch and
    writes the state_dict of the best-scoring epoch to CONFIG.MODEL_PATH.
    """
    df = pd.read_csv(CONFIG.TRAINING_FILE)

    # fold held out for validation
    fold = 3

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = BERTDataset(comment=df_train.cleaned_text, target=df_train.y)
    # shuffle so that batches do not follow the row order of the file
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=CONFIG.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4,
    )

    valid_dataset = BERTDataset(comment=df_valid.cleaned_text, target=df_valid.y)
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=CONFIG.VALID_BATCH_SIZE,
        num_workers=1,
    )

    # use the GPU when there is one, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # biases and LayerNorm parameters are excluded from weight decay
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimized_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # The scheduler is stepped once per batch, so its horizon is the number of
    # batches actually produced (the last, partial batch included), not the
    # floored row count divided by the batch size.
    num_train_steps = len(train_data_loader) * CONFIG.EPOCHS

    optimizer = AdamW(optimized_parameters, lr=3e-5)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    # -inf guarantees that the first epoch always produces a checkpoint
    best_accuracy = float("-inf")

    for epoch in range(CONFIG.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)

        outputs, targets = eval_fn(valid_data_loader, model, device)

        # probabilities -> hard predictions at the 0.5 threshold
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print (f"Accuracy score: {accuracy}")

        # keep only the best epoch instead of overwriting the file every time
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), CONFIG.MODEL_PATH)
            best_accuracy = accuracy

In [None]:
#train()

In [None]:
def get_predictions(checkpoint):
    """
    Score both columns of the paired validation file and print how often the
    "more toxic" comment receives the higher score.

    Reads CONFIG.VALIDATION_FILE (columns more_toxic and less_toxic), loads
    the weights stored at ``checkpoint`` and prints the pairwise accuracy.

    :param checkpoint: path of a state_dict saved with torch.save
    :return: None - the accuracy is only printed
    """
    df = pd.read_csv(CONFIG.VALIDATION_FILE)
    # BERTDataset needs a target for every row; the value is never used here
    df["y"] = -1

    # use the GPU when there is one, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    model.load_state_dict(torch.load(checkpoint, map_location=device))

    def _score_column(comments):
        # one flat list of scores, in row order, for a single text column
        dataset = BERTDataset(comment=comments, target=df.y)
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=CONFIG.VALID_BATCH_SIZE,
            num_workers=1,
        )
        outputs, _ = eval_fn(loader, model, device)
        return [score for row in outputs for score in row]

    outputs_more_toxic = _score_column(df.more_toxic)
    outputs_less_toxic = _score_column(df.less_toxic)

    # a pair counts as correct when the "more toxic" text gets the higher score
    score = [more > less for more, less in zip(outputs_more_toxic, outputs_less_toxic)]
    accuracy = np.mean(score)

    print (f"Accuracy on kaggle validation dataset is {accuracy}")

In [None]:
#checkpoint = "model.bin"

#p1, p2 = get_predictions(checkpoint)

In [None]:
def get_predictions_test(checkpoint):
    """
    Score every comment of the test file with one saved model.

    Reads CONFIG.TEST_FILE (column text), loads the weights stored at
    ``checkpoint`` and runs the model over the rows in file order.

    :param checkpoint: path of a state_dict saved with torch.save
    :return: flat list with one score per row of the test file
    """
    df = pd.read_csv(CONFIG.TEST_FILE)
    # BERTDataset needs a target for every row; the value is never used here
    df["y"] = -1

    test_dataset = BERTDataset(comment=df.text, target=df.y)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=CONFIG.TEST_BATCH_SIZE,
        num_workers=1,
    )

    # use the GPU when there is one, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    model.load_state_dict(torch.load(checkpoint, map_location=device))

    # the dummy targets returned by eval_fn are discarded
    outputs, _ = eval_fn(test_data_loader, model, device)

    # eval_fn returns one single-element list per row; flatten it
    return [score for row in outputs for score in row]

In [None]:
# Score the test file with the first of the five saved checkpoints.
checkpoint = "../input/bertbaseline5folds/model0.bin"
preds0 = get_predictions_test(checkpoint)

In [None]:
# Score the test file with the remaining four checkpoints, one after another.
# NOTE(review): the first path is "model.bin" while the others carry an index
# (model2/3/4.bin) - confirm it is the intended checkpoint.
checkpoint1, checkpoint2, checkpoint3, checkpoint4 = (
    "../input/bertbaseline5folds/model.bin",
    "../input/bertbaseline5folds/model2.bin",
    "../input/bertbaseline5folds/model3.bin",
    "../input/bertbaseline5folds/model4.bin",
)

preds1, preds2, preds3, preds4 = [
    get_predictions_test(path)
    for path in (checkpoint1, checkpoint2, checkpoint3, checkpoint4)
]


In [None]:
# Ensemble: element-wise average of the five per-checkpoint score lists.
final_preds = np.asarray([preds0, preds1, preds2, preds3, preds4]).mean(axis = 0)

In [None]:
# Reload the test file so the scores can be attached to its rows.
test_df = pd.read_csv(CONFIG.TEST_FILE)

In [None]:
# final_preds is aligned with test_df by position: get_predictions_test reads
# the same file and its data loader does not shuffle.
test_df["score"] = final_preds

In [None]:
# Replace the averaged scores by their rank; method = "first" breaks ties by
# row order, so no two comments share a rank.
test_df["score"] = test_df["score"].rank(method = "first")

In [None]:
# Write only the comment_id and score columns, without the index, to submission.csv.
test_df[["comment_id", "score"]].to_csv("submission.csv", index = False)