# Text Classification Using Transformer Networks (BERT)

Some initialization:

In [1]:
# Installations needed to run on Cyverse each time
!pip3 install datasets
!pip3 install transformers
!pip install -U accelerate
!pip install -U transformers



In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


Read the train/dev/test datasets and create a HuggingFace `Dataset` object:

In [3]:
def read_league_data(filename):
    # read csv file
    df = pd.read_csv(filename, header=0)
    # Get only the text and label columns
    return df[["message","toxicity_label"]]

In [4]:
labels = ["not toxic","toxic"]
data = read_league_data('Chatlogs_with_Toxicity_Labels - Chatlogs_with_Toxicity_Labels.csv')
print(labels)
data = data.rename(columns={"message": "text","toxicity_label": "label"})
data

['not toxic', 'toxic']


Unnamed: 0,text,label
0,"report for unskilled player is useless,thx <3,...",not toxic
1,mimimi,not toxic
2,"im comming for you riven,pfft,focus Zed always...",toxic
3,"thx,top no flash,for what ? he has 2 kill in l...",toxic
4,"IIII,ISI,K,udyr top,dnt us see it?,CAMP MORE P...",not toxic
...,...,...
88083,"gj,&gt;&lt;,xD,i said ss,gj,stop go alone plz ...",toxic
88084,"i like the new un-do button,GET BACK,GET BACK,...",toxic
88085,"thx,now i can b,its ok,re top,sry,thought cait...",not toxic
88086,"take t,brb,swian,i was ogin b,i was going blue...",toxic


In [5]:
from sklearn.model_selection import train_test_split

train_df, eval_and_test_df = train_test_split(data, train_size=0.8, random_state=seed)
eval_df, test_df = train_test_split(eval_and_test_df, train_size=0.5, random_state=seed)
train_df.reset_index(inplace=True, drop=True)
eval_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

print(f'train rows: {len(train_df.index):,}')
print(f'eval rows: {len(eval_df.index):,}')
print(f'test rows: {len(test_df.index):,}')

train rows: 70,470
eval rows: 8,809
test rows: 8,809


In [6]:
eval_df

Unnamed: 0,text,label
0,"npnp gogogo,no ward drake,....,ff,....,stop bl...",toxic
1,"lache t es fantome udyr,no",not toxic
2,"matchmaking &lt;3,first time lol?,glad i let y...",toxic
3,"lol,ss,ss,thx,ss,rofl,O_O"",well crap,its ok,:D",not toxic
4,was thinking if i eat sushy or chinese tomorro...,toxic
...,...,...
8804,"k,ss,re,fail srry,nice bot,ss,help,nice,thx,ni...",not toxic
8805,"ss,nice xD,gg",not toxic
8806,":D,;D,gg",not toxic
8807,"gj,why?,flame on,YES HE DOES,omw,gg wp",not toxic


In [7]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(train_df)
ds['validation'] = Dataset.from_pandas(eval_df)
ds['test'] = Dataset.from_pandas(test_df)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 70470
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 8809
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8809
    })
})

Tokenize the texts:

In [8]:
from transformers import AutoTokenizer

transformer_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

In [9]:
def tokenize(examples):
    return tokenizer(examples['text'], truncation=True)

train_ds = ds['train'].map(
    tokenize, 
    batched=True,
    remove_columns=['text'],
    # remove_columns=['title', 'description', 'text'],
)
eval_ds = ds['validation'].map(
    tokenize,
    batched=True,
    remove_columns=['text'],
    # remove_columns=['title', 'description', 'text'],
)
train_ds.to_pandas()

Map:   0%|          | 0/70470 [00:00<?, ? examples/s]

Map:   0%|          | 0/8809 [00:00<?, ? examples/s]

Unnamed: 0,label,input_ids,token_type_ids,attention_mask
0,not toxic,"[101, 176, 1403, 102]","[0, 0, 0, 0]","[1, 1, 1, 1]"
1,toxic,"[101, 12477, 1186, 1408, 1894, 117, 134, 176, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,not toxic,"[101, 183, 1643, 117, 19563, 10486, 10486, 463...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,toxic,"[101, 20049, 3121, 111, 2113, 117, 192, 1604, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,toxic,"[101, 18257, 2047, 190, 181, 2723, 4355, 2386,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...
70465,toxic,"[101, 21534, 117, 11769, 1161, 11769, 1161, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
70466,not toxic,"[101, 156, 11185, 2346, 117, 175, 3715, 1358, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
70467,toxic,"[101, 146, 1301, 1106, 4417, 12551, 117, 1185,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
70468,not toxic,"[101, 178, 21884, 1444, 195, 13252, 13358, 116...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


Create the transformer model:

In [10]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        cls_outputs = outputs.last_hidden_state[:, 0, :]
        cls_outputs = self.dropout(cls_outputs)
        logits = self.classifier(cls_outputs)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [11]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    transformer_name,
    num_labels=len(labels),
    # num_labels=2
)

model = (
    BertForSequenceClassification
    .from_pretrained(transformer_name, config=config)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Create the trainer object and train:

In [12]:
from transformers import TrainingArguments

num_epochs = 10
batch_size = 24
weight_decay = 0.01
model_name = f'{transformer_name}-sequence-classification'

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    weight_decay=weight_decay,
)



In [13]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

In [14]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [16]:
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

Evaluate on the test partition:

In [None]:
test_ds = ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['text'],
    # remove_columns=['title', 'description', 'text'],
)
test_ds.to_pandas()

In [None]:
output = trainer.predict(test_ds)
output

In [None]:
from sklearn.metrics import classification_report

y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = ["not toxic","kinda toxic","toxic"]
print(classification_report(y_true, y_pred, target_names=target_names))