In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers

In [6]:
import zipfile
# Unpack the competition archive into the working directory, then read
# train.csv from there.
archive_path = "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
with zipfile.ZipFile(archive_path, mode="r") as zip_ref:
    zip_ref.extractall(path="./")

df = pd.read_csv("train.csv")
df.head()  # last expression of the cell, so the first rows are displayed

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
trainset, valset = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Inputs are the raw comment strings; targets are every column from position 2 onward.
x_train = trainset["comment_text"].tolist()
y_train = trainset.iloc[:, 2:].to_numpy()

In [9]:
# Same extraction as for the training split: comment strings and the columns from position 2 onward.
x_val = valset["comment_text"].to_list()
y_val = valset.iloc[:, 2:].to_numpy()

In [10]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [11]:
# Both splits are tokenized with identical settings, so the options live in one place.
max_seq_length = 256
tokenizer_kwargs = dict(
    add_special_tokens=True,
    max_length=max_seq_length,  # longer inputs are truncated to this many tokens
    truncation=True,
    padding=True,
)
train_encodings = tokenizer(x_train, **tokenizer_kwargs)
val_encodings = tokenizer(x_val, **tokenizer_kwargs)

In [12]:
class MultilabelDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their label rows.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection holding one label row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict holds one tensor per encoding field plus a ``'labels'`` tensor.
        """
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and label rows in a dataset.
train_dataset = MultilabelDataset(encodings=train_encodings, labels=y_train)
val_dataset = MultilabelDataset(encodings=val_encodings, labels=y_val)

In [13]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
class BertForMultilabelClassification(torch.nn.Module):
    """BERT encoder followed by dropout and a linear layer, one logit per label.

    ``forward`` returns raw logits; no sigmoid is applied inside the module.

    Args:
        num_labels: Number of output logits. Defaults to 6.
        dropout: Dropout probability applied before the linear layer.
            Defaults to 0.3.
        pretrained_name: Checkpoint name passed to
            ``transformers.BertModel.from_pretrained``. Defaults to
            ``'bert-base-uncased'``.
    """

    def __init__(self, num_labels=6, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        # Attribute names l1/l2/l3 are kept so the state_dict keys do not change.
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Read the encoder width from its config rather than hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Compute logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            token_type_ids: Optional segment id tensor passed to the encoder.

        Returns:
            The output of the linear layer applied to the encoder's second
            output after dropout.
        """
        encoder_outputs = self.l1(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index instead of tuple-unpacking: the encoder may return either a
        # plain tuple or a dict-like output object, and unpacking the latter
        # yields its keys (strings) rather than tensors. Position 1 is the
        # second output in both cases.
        pooled = encoder_outputs[1]
        return self.l3(self.l2(pooled))

In [21]:
# Build the classifier (its constructor loads the pretrained BERT encoder)
# and move it to the selected device.
model = BertForMultilabelClassification()
model.to(device)  # last expression of the cell, so the module repr is displayed

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertForMultilabelClassification(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:
# Shuffled mini-batches of 16 training examples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

In [23]:
# Optimizer over every parameter of the model.
optim = AdamW(
    model.parameters(),
    lr=5e-5,
)
def loss_fn(outputs, labels):
    """Return the binary cross-entropy between raw logits and target labels.

    Args:
        outputs: Logit tensor.
        labels: Float target tensor with the same shape as ``outputs``.

    Returns:
        Scalar tensor holding the mean loss over all elements.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, labels)

In [26]:
def train(epoch):
    """Run one pass over ``train_loader``, updating ``model`` with ``optim``.

    The batch loss is printed every 2000 steps, starting with step 0.

    Args:
        epoch: Epoch number; used only in the progress message.
    """
    model.train()  # switch the model to training mode
    for step, batch in enumerate(train_loader):
        optim.zero_grad()

        # Move the three integer inputs to the target device as long tensors.
        ids, mask, token_type_ids = (
            batch[field].to(device, dtype=torch.long)
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        )
        # The loss expects float targets.
        labels = batch['labels'].to(device, dtype=torch.float)

        loss = loss_fn(model(ids, mask, token_type_ids), labels)
        if step % 2000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optim.step()

In [27]:
# Number of full passes over the training data.
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch)  # one pass over train_loader; prints the loss every 2000 batches

Epoch: 0, Loss:  0.6524143218994141
Epoch: 0, Loss:  0.02291879430413246
Epoch: 0, Loss:  0.08846837282180786
Epoch: 0, Loss:  0.07177184522151947
Epoch: 1, Loss:  0.0006505361525341868
Epoch: 1, Loss:  0.0471428707242012
Epoch: 1, Loss:  0.017754212021827698
Epoch: 1, Loss:  0.07329648733139038
Epoch: 2, Loss:  0.010423369705677032
Epoch: 2, Loss:  0.0015257842605933547
Epoch: 2, Loss:  0.03588869422674179
Epoch: 2, Loss:  0.01760292798280716


In [28]:
# Validation batches of 16; no shuffling is requested.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
)

In [29]:
def validation():
    """Run ``model`` over ``val_loader`` without tracking gradients.

    Returns:
        A ``(model_preds, true_labels)`` pair of Python lists with one entry
        per example: the sigmoid of the model output, and the labels cast to
        float.
    """
    model.eval()  # switch the model to evaluation mode
    model_preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.float)

            probabilities = torch.sigmoid(model(ids, mask, token_type_ids))
            # Bring results back to the CPU and store them as plain lists.
            true_labels += labels.cpu().tolist()
            model_preds += probabilities.cpu().tolist()
    return model_preds, true_labels

In [31]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Collect validation probabilities and reference labels, then binarize at 0.5.
preds, labels = validation()
preds = np.greater_equal(np.array(preds), 0.5)

# Score the thresholded predictions against the reference labels.
accuracy = accuracy_score(y_true=labels, y_pred=preds)
f1_score_micro = f1_score(y_true=labels, y_pred=preds, average='micro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")

Accuracy Score = 0.9237035876547078
F1 Score (Micro) = 0.76703213610586
