In [None]:
# Import necessary libraries
import torch


In [16]:
# Check if PyTorch can see CUDA
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("CUDA device count:", torch.cuda.device_count())
    print("CUDA device name:", torch)


PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA device count: 1
CUDA device name: <module 'torch' from '/home/rayan/miniconda3/envs/sentinai_env/lib/python3.11/site-packages/torch/__init__.py'>


In [58]:
# Load the HateXplain dataset
from datasets import load_dataset
hatexplain = load_dataset("hatexplain")

In [59]:
# Display basic information about the dataset
print(hatexplain)

# Display an example from the training set
print("\nExample from training set:")
print(hatexplain['train'][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 15383
    })
    validation: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 1924
    })
})

Example from training set:
{'id': '23107796_gab', 'annotators': {'label': [0, 2, 2], 'annotator_id': [203, 204, 233], 'target': [['Hindu', 'Islam'], ['Hindu', 'Islam'], ['Hindu', 'Islam', 'Other']]}, 'rationales': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'post_tokens': ['u', 'really', 'think', 'i', 'would', 'not', 'have', 'been', 'raped', 'by', 'feral', 'hindu', 'or', 'muslim', 'back', 'in', 'india', 'or', 'bangladesh', 'and', 'a', 'neo', 'nazi', 'wo

In [60]:
from collections import Counter
from datasets import DatasetDict, Dataset

def process_dataset(data):
    processed_splits = {}
    
    for split in ["train", "validation", "test"]:
        split_data = []
        for item in data[split]:
            text = " ".join(item["post_tokens"])
            labels = item["annotators"]["label"]
            label = Counter(labels).most_common(1)[0][0]
            split_data.append({"text": text, "label": label})
        
        processed_splits[split] = Dataset.from_list(split_data)
        
    return DatasetDict(processed_splits)


In [64]:
# Process the train and validation data
hatexplain_df = process_dataset(hatexplain)

# Check class distribution 
print(Counter(hatexplain_df["train"]["label"]))
print(Counter(hatexplain_df["validation"]["label"]))
print(Counter(hatexplain_df["test"]["label"]))


Counter({1: 6251, 0: 4748, 2: 4384})
Counter({1: 781, 0: 593, 2: 548})
Counter({1: 782, 0: 594, 2: 548})


In [65]:
from transformers import AutoTokenizer

# Instantiating  AutoTokenizer will directly create a class of the relevant architecture.
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2", model_max_length=512)

In [66]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

cache_files = {"train": "~/.cache/hatexplain/hatexplain_train_tokenized.arrow",
               "validation": "~/.cache/hatexplain/hatexplain_val_tokenized.arrow",
               "test": "~/.cache/hatexplain/hatexplain_test_tokenized.arrow"
               }
               
# Path to the local cache files, where the current computation from the following function will be stored. 
# Caching saves RAM when working with large datasets and saves time instead of doing transformations on the fly.
tokenized_hatexplain = hatexplain_df.map(tokenize_function, batched=True, cache_file_names=cache_files)

In [67]:
tokenized_hatexplain = tokenized_hatexplain.remove_columns(["text"])
tokenized_hatexplain = tokenized_hatexplain.rename_column("label", "labels")
tokenized_hatexplain.set_format("torch")


# create a DataLoader for your training and test datasets so you can iterate over batches of data:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_hatexplain["train"], shuffle=True, batch_size=64)
val_dataloader = DataLoader(tokenized_hatexplain["validation"], shuffle=True, batch_size=64)
test_dataloader = DataLoader(tokenized_hatexplain["test"], batch_size=64)

In [68]:
id2label = {0: "HATESPEECH", 1: "NORMAL", 2: "OFFENSIVE"}
label2id = {"HATESPEECH": 0, "NORMAL": 1, "OFFENSIVE": 2}

In [69]:
from transformers import AutoModelForSequenceClassification


model_bert_l4 = AutoModelForSequenceClassification.from_pretrained(
    "google/bert_uncased_L-4_H-128_A-2", num_labels=3, id2label=id2label, label2id=label2id)

print("bert-tiny variant number of parameters: ", model_bert_l4.num_parameters())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert-tiny variant number of parameters:  4782851


In [70]:
from torch.optim import AdamW

optimizer = AdamW(model_bert_l4.parameters(), lr=5e-5)

from transformers import get_scheduler

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=1, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_bert_l4.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [71]:
# use the tqdm library to add a progress bar over the number of training steps
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# put the model in train mode
model_bert_l4.train()

# iterate over epochs
for epoch in range(num_epochs):
    # iterate over batches in training set
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model_bert_l4(**batch)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/241 [00:00<?, ?it/s]

In [46]:
import evaluate

# define the metric you want to use to evaluate your model
metric = evaluate.load("accuracy")
progress_bar = tqdm(range(len(val_dataloader)))

# put the model in eval mode
model_bert_l4.eval()
# iterate over batches of evaluation dataset
for batch in val_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model_bert_l4(**batch)
        
    logits = outputs.logits

    # use argmax to get the predicted class
    predictions = torch.argmax(logits, dim=-1)
    
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)
# calculate a metric by  calling metric.compute()
metric.compute()



  0%|          | 0/31 [00:00<?, ?it/s]

{'accuracy': 0.6748178980228928}

In [45]:
text_hate = "People like you dont belong here. Go back to your country — you are ruining everything."
text_normal = "I just watched the new sci-fi movie last night. The visual effects were amazing!"
text_offensive = "You're such an idiot, seriously. Do you ever think before you speak?"
text = [text_hate, text_normal, text_offensive]

inputs = tokenizer(text, return_tensors="pt",  padding = True, truncation = True)

with torch.no_grad():
    logits = model_bert_l4(**inputs.to(device)).logits


#get the predicted id class
predicted_class_id = logits.argmax(dim=1)

# get the predicted class name
for pred in predicted_class_id:
  print(model_bert_l4.config.id2label[pred.item()])

NORMAL
NORMAL
NORMAL
