# Text Classification Using Transformer Networks (BERT)

Some initialization:

In [None]:
# !pip3 install datasets
# !pip3 install transformers
# !pip install -U accelerate
# !pip install -U transformers
# !pip3 install datasets

In [None]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Hook tqdm progress bars into pandas.
tqdm.pandas()

# Switch to False to stay on the CPU even when a CUDA device is present.
use_gpu = True

# Resolve the device name first, then build the torch device from it.
device_name = 'cuda' if (use_gpu and torch.cuda.is_available()) else 'cpu'
device = torch.device(device_name)
print(f'device: {device.type}')

# Seed shared by Python, NumPy and PyTorch; use None to skip seeding.
seed = 1234

if seed is not None:
    print(f'random seed: {seed}')
    # Apply the same seed to each random number generator in turn.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

Read the dataset, split it into train/validation/test partitions, and create a HuggingFace `DatasetDict` from them:

In [None]:
def read_league_data(filename):
    """Load a CSV and keep only its text and label columns.

    Args:
        filename: Path (or file-like object) handed to `pd.read_csv`; the
            first row is treated as the header.

    Returns:
        A DataFrame with exactly the columns `message` and `toxicity_label`,
        in that order.
    """
    wanted_columns = ["message", "toxicity_label"]
    frame = pd.read_csv(filename, header=0)
    return frame[wanted_columns]

In [None]:
# Class names, one per line. The context manager closes the file handle
# right away instead of leaving it open until garbage collection.
with open('classes.txt') as classes_file:
    labels = classes_file.read().splitlines()
data = read_league_data('dataToxic.csv')
print(labels)
# Use `label` as the target column name.
data = data.rename(columns={"toxicity_label": "label"})
# Replace commas inside messages with spaces (literal match, not a regex).
data['message'] = data['message'].str.replace(',', ' ', regex=False)
# Binarize the target: 'toxic' -> 1, every other value -> 0.
# Vectorized comparison instead of a per-row Python lambda.
data['label'] = (data['label'] == 'toxic').astype('int64')
data

In [None]:
from sklearn.model_selection import train_test_split

# First cut: 80% of the rows for training, 20% held out.
train_df, eval_and_test_df = train_test_split(
    data,
    train_size=0.8,
    random_state=4,
)
# Second cut: the held-out rows are halved into validation and test.
eval_df, test_df = train_test_split(
    eval_and_test_df,
    train_size=0.5,
    random_state=4,
)

# Renumber each partition from 0 and discard the pre-split index.
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f'train rows: {len(train_df):,}')
print(f'eval rows: {len(eval_df):,}')
print(f'test rows: {len(test_df):,}')

In [None]:
# Display the validation partition for a quick visual check.
eval_df

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each pandas partition in a Dataset, keyed by split name.
ds = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(eval_df),
    'test': Dataset.from_pandas(test_df),
})
ds

Tokenize the texts:

In [None]:
from transformers import AutoTokenizer

# Checkpoint name; reused below for the config, the model and the output directory.
transformer_name = 'bert-base-cased'
# Load the tokenizer that matches the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

In [None]:
def tokenize(examples):
    """Run the module-level `tokenizer` over the `message` entry of `examples`.

    Truncation is enabled; no padding is requested here. The tokenizer's
    output is returned unchanged.
    """
    messages = examples['message']
    return tokenizer(messages, truncation=True)

# Tokenize the training and validation partitions in batches, dropping the
# raw text column once it has been tokenized.
train_ds = ds['train'].map(tokenize, batched=True, remove_columns=['message'])
eval_ds = ds['validation'].map(tokenize, batched=True, remove_columns=['message'])
train_ds.to_pandas()

Create the transformer model:

In [None]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

class BertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder with a dropout + linear classification head.

    The hidden state at sequence position 0 (presumably the [CLS] token) is
    passed through dropout and a linear layer that yields
    `config.num_labels` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Encode the inputs and classify them.

        Args:
            input_ids, attention_mask, token_type_ids: Passed to the BERT
                encoder.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            A `SequenceClassifierOutput` carrying `loss` (None when no labels
            are supplied), `logits`, and the encoder's `hidden_states` and
            `attentions`.
        """
        # NOTE(review): recent `Trainer` releases appear to inject
        # `num_items_in_batch` into the inputs of any model whose forward
        # accepts **kwargs. It is a loss-normalisation hint, not an encoder
        # argument, and forwarding it can make `BertModel.forward` raise a
        # TypeError, so drop it here. Confirm against the installed version.
        kwargs.pop('num_items_in_batch', None)
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # Keep only the first position of every sequence.
        cls_outputs = outputs.last_hidden_state[:, 0, :]
        cls_outputs = self.dropout(cls_outputs)
        logits = self.classifier(cls_outputs)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BertLSTMForSequenceClassification(BertPreTrainedModel):
    """BERT encoder followed by a single-layer LSTM and a linear classifier.

    The BERT token representations are fed to the LSTM; the LSTM's final
    hidden state is passed through dropout and a linear layer that yields
    `config.num_labels` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # Mirrors BertForSequenceClassification, which also records this.
        self.num_labels = config.num_labels
        # BERT Model
        self.bert = BertModel(config)
        # LSTM layer (input size matches the BERT hidden size).
        # nn.LSTM applies its `dropout` argument only between stacked layers,
        # so with num_layers=1 a non-zero value has no effect and only
        # triggers a warning; the explicit nn.Dropout below does the
        # regularisation instead.
        self.lstm = nn.LSTM(input_size=config.hidden_size, hidden_size=128, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.1)
        # Fully connected layer to output logits
        self.classifier = nn.Linear(128, config.num_labels)
        # Same initialisation hook the sibling classifier calls.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Encode the inputs with BERT + LSTM and classify them.

        Args:
            input_ids, attention_mask, token_type_ids: Passed to the BERT
                encoder. When `attention_mask` is given it is also used to
                stop the LSTM at the last real token of each sequence.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.

        Returns:
            A `SequenceClassifierOutput` with `loss` (None when no labels are
            supplied) and `logits`.
        """
        # Pass through BERT
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Per-token representations from BERT
        sequence_output = outputs.last_hidden_state

        if attention_mask is None:
            # No padding information: run over the full sequence.
            _, (hn, _) = self.lstm(sequence_output)
        else:
            # Without packing, the LSTM keeps stepping through padding and
            # `hn` would be the state after the pad tokens rather than after
            # the last real token.
            # Assumes right-padded batches (real tokens first) — TODO confirm
            # if the tokenizer's padding side is ever changed.
            lengths = attention_mask.sum(dim=1).long().clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                sequence_output,
                lengths,
                batch_first=True,
                enforce_sorted=False,
            )
            _, (hn, _) = self.lstm(packed)

        # Final hidden state of the (only) LSTM layer, one row per sequence.
        pooled_output = hn[-1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)


In [None]:
from transformers import AutoConfig

# The task is binary (toxic / not toxic), so the head gets two outputs
# rather than one per entry in `labels`.
config = AutoConfig.from_pretrained(transformer_name, num_labels=2)

# Build the classifier from the pretrained checkpoint using that config.
model = BertForSequenceClassification.from_pretrained(transformer_name, config=config)

Create the trainer object and train:

In [None]:
from transformers import TrainingArguments

# Training hyperparameters.
num_epochs = 2
# A single assignment: the earlier value of 24 was immediately overwritten.
batch_size = 32
weight_decay = 0.01
# Also used as the output directory for checkpoints.
model_name = f'{transformer_name}-sequence-classification'

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch',
    weight_decay=weight_decay,
    # Half-precision training needs a CUDA device; requesting it
    # unconditionally makes this cell fail on CPU-only machines, so follow
    # the device selected at the top of the notebook.
    fp16=device.type == 'cuda',
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions in `eval_pred`.

    `eval_pred` must expose `predictions` (per-class scores, reduced with an
    arg-max over the last axis) and `label_ids` (reference labels). The
    result is a dict with a single `accuracy` entry.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=-1)
    accuracy = accuracy_score(eval_pred.label_ids, predicted_classes)
    return {'accuracy': accuracy}

In [None]:
from transformers import Trainer

# Bundle the model, its hyperparameters, the tokenized partitions, the
# tokenizer and the metric callback into one Trainer.
trainer = Trainer(
    # what to train and how
    model=model,
    args=training_args,
    # data
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,
    # evaluation
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the arguments and datasets configured on the trainer.
trainer.train()

Evaluate on the test partition:

In [None]:
# Tokenize the held-out test partition in batches and drop the raw text column.
test_ds = ds['test'].map(tokenize, batched=True, remove_columns=['message'])
test_ds.to_pandas()

In [None]:
# Run the trained model on the test partition; the result's `predictions`
# and `label_ids` are used for the report below.
output = trainer.predict(test_ds)
output

In [None]:
from sklearn.metrics import classification_report

# Names for class indices 0 and 1, in that order.
target_names = ["not toxic", "toxic"]
# Predicted class = index of the highest score along the last axis.
y_pred = np.argmax(output.predictions, axis=-1)
y_true = output.label_ids
print(classification_report(y_true, y_pred, target_names=target_names))

In [None]:
# Class names, one per line. The context manager closes the file handle
# right away instead of leaving it open.
with open('classes.txt') as classes_file:
    labels = classes_file.read().splitlines()
df = pd.read_csv("tagged-data.csv", header=0)
# Get only the text and label columns
df = df[["text", "target"]]
print(labels)
# Use the column names the rest of the pipeline relies on:
#  - `message` is what `tokenize` reads;
#  - `label` is the column the Trainer keeps as the target. A column called
#    `target` is not a model input or a recognised label name, so it would
#    be dropped as unused and `trainer.predict(...).label_ids` would be
#    None, which breaks the classification report.
df = df.rename(columns={"text": "message", "target": "label"})
# Fold class 2 into class 1 so the labels are binary (0 / 1).
df['label'] = df['label'].replace(2, 1)
df

In [None]:
# The second corpus is evaluation-only, so it has a single `test` split.
dota_ds = DatasetDict({'test': Dataset.from_pandas(df)})
dota_ds

In [None]:
# Tokenize the second corpus in batches and drop the raw text column.
test_ds_dota = dota_ds['test'].map(tokenize, batched=True, remove_columns=['message'])
test_ds_dota.to_pandas()

In [None]:
# Run the same trained model on the second corpus; the result's
# `predictions` and `label_ids` are used for the report below.
output_dota = trainer.predict(test_ds_dota)
output_dota

In [None]:
# Names for class indices 0 and 1, in that order.
target_names = ["not toxic", "toxic"]
# Predicted class = index of the highest score along the last axis.
y_pred = np.argmax(output_dota.predictions, axis=-1)
y_true = output_dota.label_ids
print(classification_report(y_true, y_pred, target_names=target_names))