# Objective: To detect hate speech using Transformers

1. Loading and Preprocessing the data

2. Training a classifier using pre-trained ALBERT

3. Validating and quantifying the model performance

In [None]:
# Install transformers and PyTorch Lightning libraries

!pip install transformers
#!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade
!pip install pytorch-lightning
!pip install SentencePiece # Required for AlbertTokenizer

In [None]:
# Import required libraries

import pandas as pd
import re
import sklearn
from sklearn.model_selection import train_test_split
from google.colab import drive

from transformers import AlbertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

In [None]:
# Mount Google Drive at /content/gdrive (Colab only) so files stored in Drive can be read.
# NOTE(review): the CSV is later read through a relative path ("gdrive/MyDrive/..."),
# which only resolves under this mount when the working directory is /content — confirm.

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Loading and Preprocessing the data

In [None]:
# Load the tweets and their class labels from a single CSV on Google Drive,
# then preview the first five rows

hspeech_df = pd.read_csv(filepath_or_buffer="gdrive/MyDrive/hate_speech_data.csv")
hspeech_df.head(5)

Unnamed: 0.1,Unnamed: 0,tweet,class
0,0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,1,""" momma said no pussy cats inside my doghouse """,0
2,2,"""@Addicted2Guys: -SimplyAddictedToGuys http://...",0
3,3,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo...",0
4,4,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these...",0


In [None]:
# Remove username mentions

def clean_tweet(tweet):
    """Normalise a raw tweet before tokenization.

    Steps, in order:
      1. lowercase the text and strip leading/trailing whitespace;
      2. remove @username mentions;
      3. remove every character that is not a letter, digit, space or tab.

    Args:
        tweet (str): raw tweet text.

    Returns:
        str: the cleaned tweet. Whitespace left behind by removed tokens is kept,
        so the result may start with, or contain runs of, spaces.
    """
    tweet = tweet.lower().strip()
    # Handles may contain underscores; without "_" in the character class a mention
    # such as "@some_user" would only lose "@some" and leak "user" into the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", "", tweet)
    # Drop punctuation, emoji and any other non-alphanumeric symbol.
    tweet = re.sub(r"[^0-9A-Za-z \t]", "", tweet)
    return tweet

In [None]:
# Clean every tweet; the raw text in the "tweet" column is overwritten with the result
cleaned_tweets = hspeech_df["tweet"].map(clean_tweet)
hspeech_df["tweet"] = cleaned_tweets

In [None]:
# Preview the first rows to check the result of the cleaning step
hspeech_df.head()

Unnamed: 0.1,Unnamed: 0,tweet,class
0,0,rt as a woman you shouldnt complain about cl...,0
1,1,momma said no pussy cats inside my doghouse,0
2,2,simplyaddictedtoguys httptco1jl4hi8zmf woof w...,0
3,3,httptco3gzupfumev woof woof and hot soles,0
4,4,lemmie eat a oreo amp do these dishes one ore...,0


In [None]:
# Features are the cleaned tweet texts, targets are the class labels (both as NumPy arrays)
X = hspeech_df["tweet"].values
y = hspeech_df["class"].values

# Split the data into training and validation sets (default: 25% held out for validation).
# A fixed random_state makes the split, and therefore every metric reported later,
# reproducible across runs; stratify=y keeps the class ratio identical in both subsets.
SEED = 42
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    X, y, random_state=SEED, stratify=y
)

In [None]:
# Load the pre-trained tokenizer that belongs to the albert-base-v2 checkpoint

albert_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(albert_checkpoint)

In [None]:
# Encode the tweets as PyTorch tensors; each text is truncated to at most 64 tokens
# and padded so that all sequences of a split share one length.
# https://huggingface.co/docs/transformers/v4.22.1/en/main_classes/tokenizer#transformers.PreTrainedTokenizer

encode_options = {"return_tensors": "pt", "padding": True, "truncation": True, "max_length": 64}

train_tokens = tokenizer(list(train_tweets), **encode_options)
val_tokens = tokenizer(list(val_tweets), **encode_options)

In [None]:
# Bundle each split as [input_ids, attention_mask, token_type_ids, labels], placed on the
# compute device. Fall back to the CPU when no GPU is present instead of crashing on "cuda".

device = "cuda" if torch.cuda.is_available() else "cpu"


def _split_to_tensors(tokens, labels):
    """Return the four tensors of one split on `device`, in the order the model steps index them."""
    fields = ("input_ids", "attention_mask", "token_type_ids")
    return [tokens[name].to(device) for name in fields] + [torch.tensor(labels).to(device)]


trn = _split_to_tensors(train_tokens, train_labels)
val = _split_to_tensors(val_tokens, val_labels)

In [None]:
# Data module that serves the training and validation batches

BATCH_SIZE = 32


class ClassificationData(pl.LightningDataModule):
    """Wrap the pre-built train/validation tensors in DataLoaders for the Lightning Trainer.

    Args:
        trn: sequence of tensors for training; they are unpacked into a TensorDataset
            (here: input_ids, attention_mask, token_type_ids, labels).
        val: the same tensors for validation.
        batch_size: number of samples per batch; defaults to BATCH_SIZE.
    """

    def __init__(self, trn, val, batch_size=BATCH_SIZE):
        super().__init__()
        # Reshuffle the training samples every epoch so the optimizer does not see the
        # batches in the same order each time; the validation order stays fixed.
        self.trn = DataLoader(TensorDataset(*trn), batch_size=batch_size, shuffle=True)
        self.val = DataLoader(TensorDataset(*val), batch_size=batch_size)

    def train_dataloader(self):
        """Return the (shuffled) training DataLoader."""
        return self.trn

    def val_dataloader(self):
        """Return the validation DataLoader."""
        return self.val


dls = ClassificationData(trn, val)

In [None]:
# Peek at the first training batch: it should be a list of 4 tensors -
# input_ids, attention_masks, token_type_ids, and labels
first_batch = next(iter(dls.trn))
print(first_batch)

[tensor([[    2,    13,  5256,  ...,     0,     0,     0],
        [    2,   931,  9262,  ...,     0,     0,     0],
        [    2, 19037,  5485,  ...,     0,     0,     0],
        ...,
        [    2,    39,   123,  ...,     0,     0,     0],
        [    2,    13,    18,  ...,     0,     0,     0],
        [    2, 22086, 16770,  ...,     0,     0,     0]], device='cuda:0'), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 1, 0], device='cuda:0')]




*   input_ids of shape : 32x64
*   attention_masks of shape: 32x64
*   token_type_ids of shape: 32x64
*   labels of shape: 32 (a 1-D tensor holding one class label per tweet)



## Training the Classifier Using Pre-trained ALBERT


In [None]:
# Load the pre-trained ALBERT encoder from the 'albert-base-v2' checkpoint.
# AlbertClassifier picks up this module-level object as its backbone
# (it assigns `albert_model` to `self.albert` in its __init__).
from transformers import AlbertModel
albert_model = AlbertModel.from_pretrained('albert-base-v2')

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# The AlbertClassifier class
# dropout_p  : dropout probability
# hid_dim    : hidden layer dimension
# output_dim : dimension of the output layer

class AlbertClassifier(pl.LightningModule):
    """ALBERT encoder followed by a two-layer classification head.

    The encoder output at the first token position is passed through
    Linear -> ReLU -> Dropout -> Linear -> log_softmax, and the model is trained
    with negative log-likelihood loss.

    Args:
        dropout_p: dropout probability applied after the first linear layer.
        hid_dim: size of the encoder output fed to the head; also the width of
            the hidden linear layer.
        output_dim: number of output units (one per class).
    """

    def __init__(self, dropout_p, hid_dim, output_dim):
        super().__init__()
        # Uses the module-level `albert_model` instance loaded in an earlier cell.
        self.albert = albert_model
        self.dropout = torch.nn.Dropout(dropout_p)
        self.linear_1 = torch.nn.Linear(hid_dim, hid_dim)
        self.linear_2 = torch.nn.Linear(hid_dim, output_dim)
        # NLLLoss expects log-probabilities, which forward() produces via log_softmax.
        self.loss = torch.nn.NLLLoss()

    def forward(self, input_ids, attention_mask, token_ids):
        """Return per-class log-probabilities, one row per input sequence."""
        hidden_states = self.albert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_ids
        )[0]
        # Keep only the representation at the first token position of each sequence.
        first_token = hidden_states[:, 0]
        hidden = self.dropout(torch.relu(self.linear_1(first_token)))
        return torch.log_softmax(self.linear_2(hidden), dim=1)

    def _shared_step(self, batch):
        """Compute the NLL loss for one [input_ids, attention_mask, token_type_ids, labels] batch."""
        log_probs = self(batch[0], batch[1], batch[2])
        return self.loss(log_probs, batch[3].view(-1))

    def training_step(self, batch, ix):
        """Return the training loss for the batch and log it as `train_loss`."""
        loss = self._shared_step(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, ix):
        """Return the validation loss for the batch and log it as `val_loss`.

        Without logging, the validation pass would compute the loss and discard it.
        """
        loss = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Optimize all parameters (encoder and head) with Adam at a learning rate of 1e-5."""
        return torch.optim.Adam(self.parameters(), lr=1e-5)

# dropout 0.5, hidden size 768, 2 output classes
model = AlbertClassifier(0.5, 768, 2)

In [None]:
# Train the model
# dls is the ClassificationData module that provides the train/validation loaders.
# The trainer is stored as `trainer`, not `trn`: `trn` already holds the training tensors,
# and overwriting it would break a re-run of the cell that builds `dls`.
# `accelerator`/`devices` replace the `gpus` argument, which is deprecated since
# Lightning v1.7; "auto" selects the GPU when one is available. The Trainer moves
# the model to the selected device itself.

trainer = pl.Trainer(max_epochs=4, accelerator="auto", devices=1)
trainer.fit(model, dls)

  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type        | Params
-----------------------------------------
0 | albert   | AlbertModel | 11.7 M
1 | dropout  | Dropout     | 0     
2 | linear_1 | Linear      | 590 K 
3 | linear_2 | Linear      | 1.5 K 
4 | loss     | NLLLoss     | 0     
-----------------------------------------
12.3 M    Trainable params
0         Non-trainable params
12.3 M    Total params
49.103    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


In [None]:
# This prints the model architecture (the nested module tree of the AlbertClassifier instance)
print(model)

AlbertClassifier(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
   

## Evaluating the classifier

In [None]:
# Calculate the predictions on the validation data

device = "cuda" if torch.cuda.is_available() else "cpu"

pred = []
truth = []

# Move the model once (not once per batch) and switch to eval mode: the head contains a
# Dropout layer, and eval mode guarantees it is disabled regardless of the mode the
# Trainer left the model in.
model.to(device)
model.eval()

# Inference needs no gradients; disabling autograd avoids building graphs for every batch.
with torch.no_grad():
    for val_batch in dls.val:
        # Batches are [input_ids, attention_mask, token_type_ids, labels]
        input_ids, attention_mask, token_type_ids, labels = (t.to(device) for t in val_batch)

        val_pred = model(input_ids, attention_mask, token_type_ids)  # model is created previously
        val_label = val_pred.argmax(dim=1).cpu().numpy()  # index of the most likely class
        val_true = labels.cpu().numpy()

        pred.extend(val_label)
        truth.extend(val_true)


In [None]:
# Precision of the validation predictions against the true labels

precision_score = sklearn.metrics.precision_score(y_true=truth, y_pred=pred)

print(precision_score)

0.8154613466334164


In [None]:
# Per-class precision, recall and F1-score on the validation set

from sklearn.metrics import classification_report
report_text = classification_report(truth, pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      1029
           1       0.82      0.88      0.85       370

    accuracy                           0.92      1399
   macro avg       0.89      0.91      0.90      1399
weighted avg       0.92      0.92      0.92      1399



For hate speech detection, precision is more important than recall, since we want the model to be absolutely sure about the data points that it predicts to be hate speech.

In other words, we care more about the quality of the model predictions than the quantity of them.