# Imports

In [None]:
pip install transformers

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer
import pandas as pd
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
from transformers import BertModel
from torchsummary import summary
from sklearn.metrics import roc_auc_score

# Tokenizer, Dataset, BERT Model

In [None]:
# Load the pre-trained 'bert-base-uncased' tokenizer through the Hugging Face library
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Map every label name to its integer index (position in the list below)
labels = {
    name: index
    for index, name in enumerate(
        ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    )
}

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# dataset to load training or test data

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized comments with their six label values.

    The constructor reads the six label columns of the dataframe into a
    float32 array and runs the module-level ``tokenizer`` once per entry of
    ``df['comment_text']`` (padded / truncated to 512 tokens, returned as
    PyTorch tensors). Indexing yields ``(tokenized_text, label_row)``.
    """

    def __init__(self, df):
        # label matrix: one row per comment, one column per label
        label_columns = ['toxic',
                         'severe_toxic',
                         'obscene',
                         'threat',
                         'insult',
                         'identity_hate']
        self.labels = df[label_columns].to_numpy(dtype=np.float32)

        def encode(comment):
            # every comment is padded / truncated to the same fixed length
            return tokenizer(comment,
                             padding='max_length',
                             max_length=512,
                             truncation=True,
                             return_tensors="pt")

        # tokenize every text from the dataframe up front
        self.texts = list(map(encode, df['comment_text']))

    def classes(self):
        """Return the full label array."""
        return self.labels

    def __len__(self):
        """Return the number of label rows."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the labels at ``idx`` as a new numpy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, labels)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:
# BERT model to be trained

class BertClassifier(nn.Module):
    """Pre-trained BERT encoder with a dropout + linear + sigmoid head.

    Args:
        dropout: Drop probability applied to the pooled encoder output.
        pretrained_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the BERT-small checkpoint used originally.
        num_labels: Number of output scores per input (six by default, one
            per label column).
    """

    def __init__(self, dropout=0.5, pretrained_name='prajjwal1/bert-small', num_labels=6):

        super().__init__()

        # pre-trained encoder (BERT-small by default)
        self.bert = BertModel.from_pretrained(pretrained_name)

        # additional fully-connected layer for classification; its input
        # width is read from the encoder config instead of being hard-coded,
        # so a checkpoint with a different hidden size still fits the head
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.sigm = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return one sigmoid score per label for every input sequence.

        Args:
            input_id: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.

        Returns:
            The sigmoid of the linear head applied to the pooled output.
        """
        # with return_dict=False the encoder returns a tuple; only its
        # second element (the pooled output) feeds the classification head
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.sigm(linear_output)

        return final_layer

# Make Training Dataset

In [None]:
# Read the cleaned training CSV, tokenize it once, and cache the resulting
# dataset object on disk so later sessions can load it directly
train_csv_path = '/content/drive/MyDrive/NLP/bert/data/train_clean.csv'
train_cache_path = '/content/drive/MyDrive/NLP/bert/data/pytorch_train_dataset.pt'

df = pd.read_csv(train_csv_path)
train_dataset = Dataset(df)
torch.save(train_dataset, train_cache_path)

In [None]:
#train_dataset = torch.load('/content/drive/MyDrive/NLP/bert/data/pytorch_train_dataset.pt')

# Train Model

In [None]:
# training loop

def train(model, train, learning_rate, epochs, batch_size=32,
          checkpoint_prefix='/content/drive/MyDrive/NLP/bert/bert_model_epoch_'):
    """Fine-tune ``model`` with binary cross-entropy and save it after every epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` whose outputs are
            probabilities (``nn.BCELoss`` is applied to them directly).
        train: Dataset yielding ``(encoding, labels)`` pairs, where
            ``encoding`` provides ``'input_ids'`` and ``'attention_mask'``.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over ``train``.
        batch_size: Mini-batch size for the shuffled data loader.
        checkpoint_prefix: The whole model is saved with ``torch.save`` to
            ``checkpoint_prefix + str(epoch_num) + '.pt'`` after each epoch.

    Returns:
        None. The model is updated in place and checkpoints are written.
    """
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    num_batches = len(train_dataloader)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model before building the optimizer so the optimizer is
    # created over the parameters on the device they are trained on.
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        # Make sure dropout is active even if the caller left the model in eval mode.
        model.train()
        total_loss_train = 0.0

        for step, (train_input, train_label) in enumerate(tqdm(train_dataloader)):

            train_label = train_label.to(device).float()
            # Drop the per-sample singleton dimension from both tensors so
            # the mask has the same (batch, sequence) layout as the ids.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            batch_loss.backward()
            optimizer.step()

            loss_value = batch_loss.item()
            total_loss_train += loss_value
            # lightweight progress signal: print the loss every 10th batch
            if step % 10 == 0:
                print(loss_value)

        # Report the accumulated loss, which was previously computed but never used.
        if num_batches:
            print(f'epoch {epoch_num}: mean training loss {total_loss_train / num_batches:.4f}')

        torch.save(model, checkpoint_prefix + str(epoch_num) + '.pt')

In [None]:
# training
# Fine-tuning hyper-parameters and a fresh classifier, then run the training loop
learning_rate = 2e-5
epochs = 4
model = BertClassifier()

train(model=model, train=train_dataset, learning_rate=learning_rate, epochs=epochs)

# Testing Model

In [None]:
# Load the serialized test dataset object from the path below
# (presumably written with torch.save, like the training dataset - confirm).
# NOTE(review): torch.load unpickles the stored object, so only load files you
# trust; recent PyTorch releases may need weights_only=False here - confirm
# against the installed version.
test_dataset = torch.load('/content/drive/MyDrive/NLP/bert/data/pytorch_test_dataset.pt')

In [None]:
# Load the whole model object saved by the training loop after the epoch with index 2.
# NOTE(review): the file is a fully pickled module, so the BertClassifier class
# must be defined before this cell runs; on a CPU-only machine a map_location
# argument is presumably needed if the checkpoint was saved from GPU - confirm.
model = torch.load('/content/drive/MyDrive/NLP/bert/bert_model_epoch_2.pt')

In [None]:
# Run the loaded model over the test set and collect per-batch scores and labels
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Keep the model on the same device the inputs are sent to, then switch to
# evaluation mode so dropout is disabled
model = model.to(device)
model.eval()

test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Tracking variables
predictions, true_labels = [], []

# Predict (no gradients are needed for inference)
with torch.no_grad():
    for test_input, test_label in tqdm(test_dataloader):
        # Drop the per-sample singleton dimension from both tensors so the
        # mask has the same (batch, sequence) layout as the ids
        mask = test_input['attention_mask'].squeeze(1).to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)
        outputs = model(input_id, mask)

        # Store predictions and true labels as numpy arrays on the CPU; the
        # labels never go through the model, so they are converted directly
        # instead of being copied to the device and back
        logits = outputs.cpu().numpy()
        label_ids = test_label.float().numpy()
        predictions.append(logits)
        true_labels.append(label_ids)

print('DONE.')

100%|██████████| 2000/2000 [03:38<00:00,  9.15it/s]

DONE.





In [None]:
# Join the per-batch arrays along the first axis so labels and predictions
# each become a single array
true_labels, predictions = (
    np.concatenate(batches, axis=0) for batches in (true_labels, predictions)
)

In [None]:
# Compute the ROC AUC score between the stacked true labels and the model outputs.
# NOTE(review): no `average` argument is passed, so scikit-learn's default
# averaging across the label columns applies (documented as macro) - confirm
# for the installed version.
roc_auc_score(true_labels, predictions)

0.9822168302204584