# Multi-label Text Classification with BERT and PyTorch Lightning

## Installing & importing Libraries

In [3]:
!pip install transformers --quiet

In [4]:
!pip install lightning
!pip install -U 'wandb>=0.12.10'



In [5]:
!pip install pytorch-lightning --quiet

In [6]:
!pip install torchmetrics --quiet

In [7]:
# importing libraries
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import lightning as L
import torch
import torchmetrics
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [8]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

RANDOM_SEED = 42
import lightning as L
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
pl.seed_everything(RANDOM_SEED)

Seed set to 42


42

## Loading the Data

In [9]:
import pandas as pd
file_path = 'ri_annotated_texts_final.csv'
data= pd.read_csv(file_path)
# text cleaning
def clean_text(text:str):
    import re
    text = text.strip()
    text = re.sub(r"^-\s+", "", text)
    return text
data["Version initiale"] = data["Version initiale"].apply(clean_text)
data["Version retraitée"] = data["Version retraitée"].apply(clean_text)
data = data.groupby(by="Version initiale").aggregate({"Version retraitée":'first', "Catégorie":lambda x: ", ".join(x)}).reset_index(drop=False)
classes = sorted(list(set(", ".join(data["Catégorie"]).split(", "))))
classes
class2id = {class_:id_ for id_, class_ in enumerate(classes)}
id2class = {id_:class_ for class_, id_ in class2id.items()}

data["classes"] = [[class2id[g] for g in j.split(", ")] for j in data["Catégorie"]]
data



Unnamed: 0,Version initiale,Version retraitée,Catégorie,classes
0,173h de formation en français pour étrangers d...,"Des cours de français pour débutants, 4 après-...","Explanation, Substitution","[1, 4]"
1,96h de français pour apprendre à communiquer à...,96 heures de français pour progresser à l'oral...,Substitution,[4]
2,Accompagnement et conseils pendant et après la...,Accompagnement et conseils pendant et après la...,Transcription,[7]
3,Accompagnement individuel,Accompagnement individuel,Transcription,[7]
4,Accompagnement pour les démarches,Accompagnement pour les démarches,Transcription,[7]
...,...,...,...,...
252,savoir se présenter et se comporter en entrepr...,savoir se présenter et avoir la bonne attitude...,Substitution,[4]
253,une découverte du chantier et des métiers poss...,une découverte du chantier et des métiers poss...,Transcription,[7]
254,une présentation des métiers recherchés par le...,une présentation des métiers recherchés par le...,Transcription,[7]
255,vos coordonnées,votre nom et votre numéro,Explanation,[1]


In [10]:
# Step 2: MultiLabel Binarization
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
labels_binarized = mlb.fit_transform(data['classes'])
texts = data['Version initiale'] + " [SEP] " + data['Version retraitée']

# Step 3: Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_binarized, test_size=0.2, random_state=42
)

## Preprocessing

## Tokenization

In [11]:
# loading tokenizer of bert base version
BERT_MODEL_NAME = "bert-base-cased" if 0 else "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

## Wrapping Tokenization process in a PyTorch Dataset

In [12]:
class ToxicCommentsDataset(Dataset):

    def __init__(self, texts, labels, tokenizer: BertTokenizer, max_token_len: int = 128 * 4):

        self.texts = tuple(texts)
        self.labels = tuple(labels)
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index: int):

        comment_text = self.texts[index] #data_row.comment_text
        labels = self.labels[index]#data_row[LABEL_COLUMNS]

        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        return dict(
            comment_text=comment_text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.FloatTensor(labels)
        )


In [13]:
train_dataset = ToxicCommentsDataset(train_texts, train_labels, tokenizer)

In [14]:
sample_item = train_dataset[0]

In [15]:
sample_item.keys()

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])

In [16]:
sample_item["comment_text"]

'Découvrir ma ville, ses activités et la culture française [SEP] Découvrir ma ville, ses activités et la culture française'

In [17]:
sample_item["labels"]

tensor([0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [18]:
sample_item["input_ids"].shape

torch.Size([512])

## Loading the Bert Model

In [19]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

In [20]:
sample_item["input_ids"].unsqueeze(dim=0).shape

torch.Size([1, 512])

In [21]:
prediction = bert_model(sample_item["input_ids"].unsqueeze(dim=0), sample_item["attention_mask"].unsqueeze(dim=0))

In [22]:
prediction.last_hidden_state.shape, prediction.pooler_output.shape

(torch.Size([1, 512, 768]), torch.Size([1, 768]))

### Wrapping our custom dataset into a LightningDataModule

In [23]:
class ToxicCommentsDataModule(L.LightningDataModule):

    def __init__(self, train_df, test_df, tokenizer, batch_size=0, max_token_len=128):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self, *args, **kwargs):
        self.train_dataset = ToxicCommentsDataset(train_texts, train_labels, tokenizer)
        
        self.test_dataset = ToxicCommentsDataset(val_texts, val_labels, tokenizer)
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

In [24]:
N_EPOCHS = 10
BATCH_SIZE = 32

data_module = ToxicCommentsDataModule(1, 1, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

## Modeling

### Evaluation

In [25]:
criterion = nn.BCELoss()

prediction = torch.FloatTensor(
    [10.95873564, 1.07321467, 1.58524066, 0.03839076, 15.72987556, 1.09513213]
)

labels = torch.FloatTensor(
  [1., 0., 0., 0., 1., 0.]
) 

In [26]:
torch.sigmoid(prediction)

tensor([1.0000, 0.7452, 0.8299, 0.5096, 1.0000, 0.7493])

In [27]:
output = criterion(torch.sigmoid(prediction), labels)
output

tensor(0.8725)

### Converting Bert representation to a classification task & packing it into LightningModule

In [28]:
steps_per_epoch=len(train_texts) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [29]:
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(12, 60)

In [30]:
class ToxicCommentTagger(L.LightningModule):

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)    
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def on_train_epoch_end(self, *args):
    # trainer, pl_module
    if 0:
      labels = []
      predictions = []
      for output in outputs:
        for out_labels in output["labels"].detach().cpu():
          labels.append(out_labels)
        for out_predictions in output["predictions"].detach().cpu():
          predictions.append(out_predictions)

      labels = torch.stack(labels).int()
      predictions = torch.stack(predictions)

      for i, name in enumerate(LABEL_COLUMNS):
        class_roc_auc = auroc(predictions[:, i], labels[:, i])
        self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)


  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )
     

In [31]:
model = ToxicCommentTagger(
  n_classes=len(classes),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps 
)

## Training

In [42]:
checkpoint_callback = ModelCheckpoint(
  dirpath="/safespace/checkpoints",
  filename="best-checkpoint",
  save_top_k=0,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [43]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [46]:
import lightning as L
from lightning.pytorch.loggers import WandbLogger
wandb_logger = WandbLogger(project="MNIST")

trainer = L.Trainer(
  #logger=wandb_logger,
  #checkpoint_callback=checkpoint_callback,
  #callbacks=[early_stopping_callback],
  max_epochs=1,#N_EPOCHS,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [47]:
import os
os.environ["TOKENIZERS_PARALLELISM"]="false"
trainer.fit(model, data_module)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params | Mode 
-------------------------------------------------
0 | bert       | BertModel | 108 M  | eval 
1 | classifier | Linear    | 6.9 K  | train
2 | criterion  | BCELoss   | 0      | train
-------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.269   Total estimated model params size (MB)
2         Modules in train mode
228       Modules in eval mode


Epoch 0: 100%|██████████| 7/7 [00:29<00:00,  0.24it/s, v_num=1, train_loss=0.460, val_loss=0.462]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 7/7 [00:42<00:00,  0.16it/s, v_num=1, train_loss=0.460, val_loss=0.462]


In [48]:
model.freeze()
model.eval()

ToxicCommentTagger(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

## Predictions

In [54]:
test_comment = "Une adhésion de 15 € / an à l'association est demandée lors de l'inscription."

In [55]:
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

In [56]:
_, test_prediction = model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()

for label, prediction in zip(classes, test_prediction):
  print(f"{label}: {prediction}")

Compression: 0.2682689130306244
Explanation: 0.42280417680740356
Modulation: 0.284383624792099
Omission: 0.264065146446228
Substitution: 0.3774208128452301
Synonymy: 0.24224412441253662
Syntactic: 0.2658673822879791
Transcription: 0.2548842430114746
Transposition: 0.22289644181728363


In [70]:
THRESHOLD = 0.5

test_comment = "You are such a loser! You'll regret everything you've done to me!"
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

_, test_prediction = model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()

for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  if prediction < THRESHOLD:
    continue
  print(f"{label}: {prediction}")

toxic: 0.9928410053253174
insult: 0.906752347946167


## Evaluation

In [73]:
MAX_TOKEN_COUNT = 512

In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = model.to(device)

val_dataset = ToxicCommentsDataset(
  val_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

for item in tqdm(val_dataset):
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device), 
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

  0%|          | 0/7979 [00:00<?, ?it/s]

In [77]:
accuracy(predictions, labels, threshold=THRESHOLD)

tensor(0.9720)

In [78]:
print("AUROC per tag")
for i, name in enumerate(LABEL_COLUMNS):
  tag_auroc = auroc(predictions[:, i], labels[:, i], pos_label=1)
  print(f"{name}: {tag_auroc}")

AUROC per tag
toxic: 0.9726083278656006
severe_toxic: 0.9910653829574585
obscene: 0.9901463985443115
threat: 0.9314780235290527
insult: 0.9838355183601379
identity_hate: 0.9806649088859558


In [79]:
y_pred = predictions.numpy()
y_true = labels.numpy()

upper, lower = 1, 0

y_pred = np.where(y_pred > THRESHOLD, upper, lower)

print(classification_report(
  y_true, 
  y_pred, 
  target_names=LABEL_COLUMNS, 
  zero_division=0
))

               precision    recall  f1-score   support

        toxic       0.50      0.96      0.66       748
 severe_toxic       0.45      0.56      0.50        80
      obscene       0.76      0.87      0.81       421
       threat       0.27      0.46      0.34        13
       insult       0.69      0.77      0.73       410
identity_hate       0.52      0.65      0.58        71

    micro avg       0.58      0.86      0.69      1743
    macro avg       0.53      0.71      0.60      1743
 weighted avg       0.60      0.86      0.70      1743
  samples avg       0.08      0.08      0.08      1743



We now know that the model makes mistakes on the tags with low amounts of samples.