# Import Libraries


In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import sklearn
import transformers
from sklearn.model_selection import train_test_split
from transformers import AutoModel, BertTokenizer

In [None]:
import torch 
import torch.nn as nn

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

In [None]:
# Display the installed versions of numpy, pandas, scikit-learn and transformers.
np.__version__, pd.__version__, sklearn.__version__, transformers.__version__

In [None]:
# Display the installed PyTorch version.
torch.__version__

# Set Configs

In [None]:
# Directory passed to AutoModel.from_pretrained to load the pretrained encoder.
MODEL =  '../input/output-detox/model/'
# Directory passed to BertTokenizer.from_pretrained to load the tokenizer.
TOKENIZER = '../input/output-detox/tokenizer/'

# Load Datasets

In [None]:
# Pairwise validation data; its "less_toxic" and "more_toxic" text columns are tokenized below.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# Comments to score for the submission; the "text" and "comment_id" columns are used below.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Ruddit data: keep only the comment text and its offensiveness score, renamed
# to the (text, y) schema shared with the other training source.
df_ruddit = pd.read_csv("../input/ruddit-jigsaw-dataset-combined-cleaned/toxic_train.csv")
df_ruddit = df_ruddit[['txt', 'offensiveness_score']]
df_ruddit = df_ruddit.rename(columns={'txt': 'text', 'offensiveness_score': 'y'})

# Min-max scale the target so that it spans [0, 1].
y_low, y_high = df_ruddit['y'].min(), df_ruddit['y'].max()
df_ruddit['y'] = (df_ruddit['y'] - y_low) / (y_high - y_low)

In [None]:
df_train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# Severity weights applied to the binary label columns; 'toxic' keeps its
# original 0/1 value (an implicit weight of 1.0).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_weights = {
    'severe_toxic': 1.2,
    'obscene': 1.3,
    'threat': 1.4,
    'insult': 1.5,
    'identity_hate': 1.6,
}
for label, weight in label_weights.items():
    df_train[label] = df_train[label] * weight

# Weighted severity score. The sum is kept as a float: casting it to int would
# truncate the fractional weights, so that e.g. a comment flagged only as
# 'severe_toxic' (1.2) would score the same as one flagged only as 'toxic' (1.0).
df_train['y'] = df_train[label_columns].sum(axis=1)
# Scale the score into [0, 1] by dividing by the largest observed value.
df_train['y'] = df_train['y'] / df_train['y'].max()

# Reduce to the (text, y) schema shared with the other training source.
df_train = df_train[['comment_text', 'y']].rename(columns={'comment_text': 'text'})


In [None]:
# Stack the Ruddit and Jigsaw frames (both reduced to 'text' and 'y') into one training frame.
df_total_train = pd.concat([df_ruddit, df_train])
# Drop the name of the Ruddit frame now that its rows live in df_total_train.
del df_ruddit
# Display (rows, columns) of the combined frame.
df_total_train.shape

In [None]:
# Randomly keep at most 200,000 training rows (fixed seed for reproducibility).
# The request is capped at the frame length because DataFrame.sample raises a
# ValueError when asked for more rows than exist without replacement.
df_total_train = df_total_train.sample(min(200000, len(df_total_train)), random_state=42)

# Import Detoxify Model and Tokenizer

In [None]:
# Load the pretrained encoder from the MODEL directory.
model = AutoModel.from_pretrained(MODEL)
# Load the BERT tokenizer from the TOKENIZER directory.
tokenizer = BertTokenizer.from_pretrained(TOKENIZER)

# Tokenization

In [None]:
# Whitespace-delimited word count of every training text, shown as a 30-bin
# histogram to help choose a padding/truncation length.
seq_len = list(map(lambda text: len(text.split()), df_total_train["text"]))
word_counts = pd.Series(seq_len)
word_counts.hist(bins=30)

In [None]:
# Fixed token length: every text is padded or truncated to this many tokens when encoded.
max_seq_len = 150

In [None]:
def _encode_texts(texts):
    """Tokenize a list of strings, padding/truncating each to ``max_seq_len`` tokens.

    Token type ids are not requested.
    """
    return tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        max_length=max_seq_len,
        truncation=True,
        return_token_type_ids=False,
    )


# training set
tokens_train = _encode_texts(df_total_train["text"].tolist())

# real validation set: both sides of every (less toxic, more toxic) pair
val_less_toxic_tokens = _encode_texts(df_val["less_toxic"].tolist())
val_more_toxic_tokens = _encode_texts(df_val["more_toxic"].tolist())

# test (submission) set
sub_tokens = _encode_texts(df_sub["text"].tolist())

# Convert Integer Sequences to Tensors

In [None]:
# training set: token ids, attention masks and regression targets
train_seq, train_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
train_y = torch.tensor(df_total_train["y"].tolist())

# validation set: ids and masks for the more-toxic and less-toxic side of each pair
val_more_toxic_seq, val_more_toxic_mask = (
    torch.tensor(val_more_toxic_tokens[field]) for field in ('input_ids', 'attention_mask')
)
val_less_toxic_seq, val_less_toxic_mask = (
    torch.tensor(val_less_toxic_tokens[field]) for field in ('input_ids', 'attention_mask')
)
# scalar long tensor holding 1
target = torch.tensor(1, dtype=torch.long)

# test (submission) set
sub_seq, sub_mask = (
    torch.tensor(sub_tokens[field]) for field in ('input_ids', 'attention_mask')
)

# Create DataLoaders


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# number of examples per batch, shared by all three loaders
batch_size = 32

# training set: (ids, mask, target) triples drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

# validation set: (more-toxic ids, more-toxic mask, less-toxic ids, less-toxic mask),
# visited in their stored order
val_data = TensorDataset(
    val_more_toxic_seq,
    val_more_toxic_mask,
    val_less_toxic_seq,
    val_less_toxic_mask,
)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

# submission set: (ids, mask), visited in their stored order
sub_data = TensorDataset(sub_seq, sub_mask)
sub_sampler = SequentialSampler(sub_data)
sub_dataloader = DataLoader(dataset=sub_data, batch_size=batch_size, sampler=sub_sampler)

# Freeze model Parameters


In [None]:
# Freeze the pretrained encoder: none of its parameters will track gradients.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

# Define Model Architecture


In [None]:
class myDetox(nn.Module):
    """Single-score regression head on top of a pretrained encoder.

    The encoder is called as ``model(sent_id, attention_mask=mask,
    return_dict=False)`` and must return a 2-tuple; the second item
    (presumably the pooled [CLS] representation - confirm for the encoder in
    use) is passed through Linear -> Dropout -> Linear to give one score per
    input sequence.

    Args:
        model: Encoder module wrapped by this head.
        hidden_size: Width of the encoder output fed to the head (default 768).
        fc_size: Width of the intermediate dense layer (default 512).
        dropout: Dropout probability applied after the first dense layer
            (default 0.2).
    """

    def __init__(self, model, hidden_size=768, fc_size=512, dropout=0.2):
        super().__init__()
        self.model = model

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, fc_size)

        # dense layer 2 (output layer, one score per sequence)
        self.fc2 = nn.Linear(fc_size, 1)

    def forward(self, sent_id, mask):
        """Return a ``(batch, 1)`` tensor of scores.

        Args:
            sent_id: Token-id tensor passed to the encoder.
            mask: Attention-mask tensor passed to the encoder.
        """
        # only the second element of the encoder output is used
        _, cls_hs = self.model(sent_id, attention_mask=mask, return_dict=False)

        # NOTE(review): there is no activation between fc1 and fc2, so apart
        # from dropout the head is a composition of two affine maps - confirm
        # this is intended.
        x = self.fc1(cls_hs)
        x = self.dropout(x)

        # output layer
        return self.fc2(x)

In [None]:
# Wrap the pretrained encoder in the regression head and move the whole
# network onto the selected device (GPU when available).
finedmodel = myDetox(model).to(device)

In [None]:
# AdamW optimizer from PyTorch. The transformers.AdamW class used previously
# is deprecated in favour of torch.optim.AdamW; eps=1e-6 and weight_decay=0.0
# reproduce the defaults of the transformers implementation, so the update
# rule is unchanged.
optimizer = torch.optim.AdamW(
    finedmodel.parameters(),
    lr=1e-3,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
 # Mean-squared-error criterion used by train() to compare predictions with the targets.
 # NOTE(review): this line starts with a stray space; outside a notebook cell that
 # would presumably be an IndentationError - confirm and remove.
 MSE_loss = nn.MSELoss()

In [None]:
def valCrossLoss(outputs1, outputs2, targets):
    """Return the margin ranking loss (margin 0, default mean reduction).

    Args:
        outputs1: Scores for the first item of each pair.
        outputs2: Scores for the second item of each pair.
        targets: Target tensor forwarded to the ranking loss.
    """
    return nn.functional.margin_ranking_loss(outputs1, outputs2, targets, margin=0)

# Define training step and evaluation step

In [None]:
# function to train the model
def train():
    """Run one epoch of MSE regression training over ``train_dataloader``.

    Relies on the module-level ``finedmodel``, ``optimizer``, ``MSE_loss``,
    ``train_dataloader`` and ``device``.

    Returns:
        float: Mean of the per-batch training losses for the epoch.
    """
    finedmodel.train()

    total_loss = 0

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to the training device
        sent_id, mask, labels = [r.to(device) for r in batch]

        # clear previously calculated gradients
        finedmodel.zero_grad()

        # get model predictions for the current batch
        preds = finedmodel(sent_id, mask)

        # The head returns (batch, 1) while the labels are (batch,). Without
        # dropping the trailing dimension, MSELoss broadcasts the pair to
        # (batch, batch) and averages over every prediction/label combination
        # instead of matching each prediction with its own label.
        loss = MSE_loss(preds.squeeze(-1), labels.float())

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(finedmodel.parameters(), 1.0)

        # update parameters
        optimizer.step()

    # average the per-batch losses over the epoch
    avg_loss = total_loss / len(train_dataloader)

    return avg_loss

In [None]:
# function that fine-tunes the model on the validation pairs
def retrain():
    """Run one pass of ranking-loss updates over ``val_dataloader``.

    For every (more toxic, less toxic) pair the model is pushed to score the
    more toxic comment higher, using ``valCrossLoss`` with targets of 1.
    Relies on the module-level ``finedmodel``, ``optimizer``,
    ``val_dataloader`` and ``device``.

    NOTE(review): the weights are updated on the same pairs the accuracy is
    measured on, so the returned accuracy is not a held-out estimate.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean loss
        per pair and ``accuracy`` is the fraction of pairs in which the more
        toxic comment received a strictly higher score (measured before each
        batch's parameter update).
    """
    print("\nEvaluating... well retrain...")

    # eval mode keeps dropout disabled even though the weights are updated below
    finedmodel.eval()

    total_loss = 0
    n_correct = 0
    n_pairs = 0

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # progress update every 1000 batches
        if step % 1000 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the training device
        batch = [t.to(device) for t in batch]

        more_toxic_ids, more_toxic_mask, less_toxic_ids, less_toxic_mask = batch

        # clear gradients left over from the previous batch; without this
        # they accumulate across batches and distort every update
        finedmodel.zero_grad()

        # model predictions (autograd stays enabled: this pass trains)
        more_toxic_outputs = finedmodel(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = finedmodel(less_toxic_ids, less_toxic_mask)

        batch_size = more_toxic_ids.size(0)
        # targets of 1 with the same shape, dtype and device as the scores,
        # so the ranking loss sees tensors of matching dimensions
        targets = torch.ones_like(more_toxic_outputs)

        loss = valCrossLoss(more_toxic_outputs, less_toxic_outputs, targets)

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(finedmodel.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # accumulate per-pair totals so a short final batch is weighted correctly
        total_loss = total_loss + (loss.item() * batch_size)
        n_correct += (less_toxic_outputs < more_toxic_outputs).sum().item()
        n_pairs += batch_size

    # per-pair averages over the whole pass (guarded against an empty loader)
    avg_loss = total_loss / max(n_pairs, 1)
    total_acc = n_correct / max(n_pairs, 1)
    return avg_loss, total_acc

In [None]:
# function for evaluating the model
def evaluate():
    """Measure pairwise ranking accuracy over ``val_dataloader``.

    Relies on the module-level ``finedmodel``, ``val_dataloader`` and
    ``device``. No gradients are computed and no parameters are updated.

    Returns:
        float: Fraction of pairs in which the more toxic comment received a
        strictly higher score than the less toxic one.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    finedmodel.eval()

    n_correct = 0
    n_pairs = 0

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # progress update every 50 batches
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the evaluation device
        batch = [t.to(device) for t in batch]

        more_toxic_ids, more_toxic_mask, less_toxic_ids, less_toxic_mask = batch

        # deactivate autograd
        with torch.no_grad():
            more_toxic_outputs = finedmodel(more_toxic_ids, more_toxic_mask)
            less_toxic_outputs = finedmodel(less_toxic_ids, less_toxic_mask)

        # Count correctly ordered pairs instead of averaging rounded per-batch
        # means, so a short final batch is not over-weighted and no rounding
        # error accumulates.
        n_correct += (less_toxic_outputs < more_toxic_outputs).sum().item()
        n_pairs += more_toxic_ids.size(0)

    # overall fraction of correctly ordered pairs (guarded against an empty loader)
    total_acc = n_correct / max(n_pairs, 1)
    return total_acc

# Start Model Training


In [None]:
# number of passes over the validation pairs
epochs = 3

# bookkeeping values that the loop below never updates
best_valid_loss = float('inf')
train_losses = []
valid_losses = []

# best pairwise accuracy seen so far; a checkpoint is written whenever it improves
best_acc = 0

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass of ranking-loss updates; returns (loss, accuracy)
    train_loss, acc = retrain()

    if acc > best_acc:
        best_acc = acc
        # write a per-epoch snapshot and refresh the fixed-name "best" file
        for checkpoint_path in ('saved_weights'+str(epoch)+'.pt', 'saved_weights.pt'):
            torch.save(finedmodel.state_dict(), checkpoint_path)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation ACC: {acc:.3f}')

In [None]:
# Load the weights of the best model written by the training loop.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved from a GPU session also loads on a CPU-only machine.
path = 'saved_weights.pt'
finedmodel.load_state_dict(torch.load(path, map_location=device))

# Get Predictions for Real Validation Data and Submission Data

In [None]:
# get predictions for test data
#with torch.no_grad():
#    preds_more = finedmodel(val_more_toxic_seq.to(device), val_more_toxic_mask.to(device))
#    preds_more = preds_more.detach().cpu().numpy()
    
#    preds_less = finedmodel(val_less_toxic_seq.to(device), val_less_toxic_mask.to(device))
#    preds_less = preds_less.detach().cpu().numpy()

In [None]:
def accuracy(l, m):
    """Print the percentage of positions where ``l`` is strictly below ``m``.

    Args:
        l: Array of scores expected to be the lower ones.
        m: Array of scores expected to be the higher ones.
    """
    pct_ordered = np.round((l < m).mean() * 100, 2)
    print(f'Validation Accuracy is {pct_ordered}')

In [None]:
# Score every submission comment, in the sequential order of sub_dataloader.
all_preds = []

# inference only: make sure dropout is disabled regardless of earlier cells
finedmodel.eval()
with torch.no_grad():
    for step, batch in enumerate(sub_dataloader):
        # progress update every 1000 batches
        if step % 1000 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(sub_dataloader)))

        # Batch-local names: rebinding and then deleting sub_seq / sub_mask
        # here would destroy the module-level tensors of the same name.
        batch_ids, batch_mask = (t.to(device) for t in batch)
        preds_sub = finedmodel(batch_ids, batch_mask)

        # flatten the (batch, 1) scores into plain Python floats
        all_preds.extend(preds_sub.reshape(-1).cpu().tolist())

In [None]:
# Pair the predicted scores with the comment_id column of the submission frame.
df_scores = pd.DataFrame({"score":all_preds, "comment_id":df_sub["comment_id"]},)

In [None]:
# Write the scores to submission.csv without the DataFrame index column.
df_scores.to_csv("submission.csv", index=False)