In [1]:
import sys

import numpy as np
import pandas as pd

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

from sklearn.metrics import f1_score, classification_report

from tqdm import tqdm

sys.path.insert(0, '..')
from src.data_collection import get_data
from src.models import HateDataset, DistilBERTMultiClass, get_distil_hyperparams

pd.set_option("display.max_colwidth", None)

In [2]:
hate_speech_ucb = get_data()

Fetching data...


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd
Reusing dataset parquet (C:\Users\UTKARSH\.cache\huggingface\datasets\parquet\ucberkeley-dlab--measuring-hate-speech-7cb9b0b8e4d0e1dd\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Processing...
Done!


In [3]:
HYPERPARAMS = get_distil_hyperparams()

In [4]:
device = HYPERPARAMS["DEVICE"]
device

device(type='cuda', index=0)

In [5]:
MAX_LEN = HYPERPARAMS["MAX_LEN"]
EPOCHS = 5
LEARNING_RATE = HYPERPARAMS["LEARNING_RATE"]
TOKENIZER = HYPERPARAMS["TOKENIZER"]

In [6]:
hate_speech_ucb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39565 entries, 0 to 39564
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        39565 non-null  object
 1   hatespeech  39565 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 618.3+ KB


In [7]:
hate_speech_ucb["hatespeech"].value_counts()

0    26608
1    12957
Name: hatespeech, dtype: int64

In [8]:
# For testing purpose
# hate_speech_ucb = hate_speech_ucb[:5000]

In [9]:
hate_speech_ucb

Unnamed: 0,text,hatespeech
0,! thank u! im transmasc and generally present masc i Maybe look a bit gay so i was wondering about how that would go. im sorry u were heckled but its good to hear things are better there now. tysm my favorite lesbian :),0
1,!Go fuck yourself faggot!,1
2,!flair [I love women and minorities],0
3,!flair [death to all niggers and gays],1
4,""" 'convoluted' genealogy of Jesus""; was that comment really necessary? I am sure that you have regular Christian viewers--like myself--who might not have appreciated that very much.",0
...,...,...
39560,🤧 The limousine liberal Jews and WASPs who want to assuage their white guilt.. SJW maladjusted gays and nigs so they can pester and bother me; bullshit wack monotheist Judeo-Christianity values that's I'm suck when I leave Chinatown,1
39561,"🤬CONGRATULATIONS #MARYLAND #BALTIMORE LIBERALS & OBAMA... YOU REACHED A NEW LOW If I didn't know better, I would have thought this was Europe. Once again, Democrat Mayor Police Video Shows Whites Being Targeted During Memorial Day Chaos in Baltimore URL",0
39562,🥳 another bitch will when another bitch can't,1
39563,"🥴🥴 next time stay your ass in the car before our ""beaner"" asses beat yah URL",1


In [10]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
val_size = 0.1

train_data = hate_speech_ucb.sample(frac=train_size, random_state=210)
test_data = hate_speech_ucb.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
val_data = test_data.sample(frac=val_size / (1 - train_size), random_state=220).reset_index()
test_data = test_data.drop(val_data.index).reset_index(drop=True)

print(f"FULL Dataset: {hate_speech_ucb.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"VAL Dataset: {val_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

training_set = HateDataset(train_data, TOKENIZER, MAX_LEN)
validation_set = HateDataset(val_data, TOKENIZER, MAX_LEN)
testing_set = HateDataset(test_data, TOKENIZER, MAX_LEN)

FULL Dataset: (39565, 2)
TRAIN Dataset: (31652, 2)
VAL Dataset: (3957, 3)
TEST Dataset: (3956, 2)


In [11]:
train_params = HYPERPARAMS["TRAIN_PARAMS"]
val_params = HYPERPARAMS["DEV_PARAMS"]
test_params = HYPERPARAMS["TEST_PARAMS"]

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

In [12]:
N_CLASSES = hate_speech_ucb["hatespeech"].nunique()

In [13]:
model = DistilBERTMultiClass(n_classes=N_CLASSES)
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTMultiClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [15]:
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [16]:
def train(epoch):
    model.train()
    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        targets = data["targets"].to(device, dtype=torch.float)
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _ % 1000 == 0:
            print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")
        loss.backward()
        optimizer.step()

In [17]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
1it [00:02,  2.46s/it]

Epoch: 1, Loss: 0.7049654722213745


1001it [04:25,  3.50it/s]

Epoch: 1, Loss: 0.3837555944919586


1979it [08:46,  3.76it/s]
1it [00:00,  5.13it/s]

Epoch: 2, Loss: 0.565147876739502


1001it [04:28,  3.45it/s]

Epoch: 2, Loss: 0.3691707253456116


1979it [08:52,  3.72it/s]
1it [00:00,  5.99it/s]

Epoch: 3, Loss: 0.24640335142612457


1001it [04:29,  3.44it/s]

Epoch: 3, Loss: 0.22634290158748627


1979it [08:53,  3.71it/s]
1it [00:00,  5.85it/s]

Epoch: 4, Loss: 0.24425974488258362


1002it [04:28,  3.82it/s]

Epoch: 4, Loss: 0.44521617889404297


1979it [08:49,  3.74it/s]
1it [00:00,  5.99it/s]

Epoch: 5, Loss: 0.08722229301929474


1001it [04:25,  3.50it/s]

Epoch: 5, Loss: 0.08093215525150299


1979it [08:46,  3.76it/s]


In [18]:
def validation(model, loader):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(loader, 0)):
            ids = data["ids"].to(device, dtype=torch.long)
            mask = data["mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [19]:
outputs, targets = validation(model, validation_loader)

final_outputs = np.argmax(outputs, axis=1)
targets = np.argmax(targets, axis=1)

3957it [00:48, 81.98it/s]


In [20]:
print(f"Got {sum(final_outputs == targets)} / {len(final_outputs)} correct")

Got 3030 / 3957 correct


In [21]:
micro_f1 = f1_score(targets, final_outputs, average="micro")
macro_f1 = f1_score(targets, final_outputs, average="macro")
weighted_f1 = f1_score(targets, final_outputs, average="weighted")

print(f"Micro F1 score:\t\t{round(micro_f1, 3)}")
print(f"Macro F1 score:\t\t{round(macro_f1, 3)}")
print(f"Weighted F1 score:\t{round(weighted_f1, 3)}")

Micro F1 score:		0.766
Macro F1 score:		0.716
Weighted F1 score:	0.757


In [22]:
print(classification_report(targets, final_outputs))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83      2665
           1       0.68      0.53      0.60      1292

    accuracy                           0.77      3957
   macro avg       0.74      0.71      0.72      3957
weighted avg       0.76      0.77      0.76      3957



In [23]:
output_model_file = HYPERPARAMS["MODEL_PATH"]
output_vocab_file = HYPERPARAMS["VOCAB_PATH"]

torch.save(model.state_dict(), output_model_file)
TOKENIZER.save_vocabulary(output_vocab_file)

print("Model Saved")

Model Saved
