In [44]:
from transformers import AutoTokenizer, AutoModel, TFAutoModel, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, BertModel, BertTokenizer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Flatten, Concatenate, GlobalAveragePooling1D
import torch
from torchsummary import summary
import tensorflow as tf
import datasets
import numpy as np
from scipy.special import softmax
from sklearn import metrics
from time import time
import gc

## Load Dataset

In [2]:
#FRENK dataset
ds = datasets.load_dataset("classla/FRENK-hate-en","binary")
texts_train = ds["train"]["text"]
labels_train = np.array(ds["train"]["label"])
texts_test = ds["test"]["text"]
labels_test = np.array(ds["test"]["label"])

Found cached dataset frenk-hate-en (C:/Users/Pedro/.cache/huggingface/datasets/classla___frenk-hate-en/binary/0.0.0/81373cea138e2fc4156010c45a19880095b891a74c39768895384d352bfe1ca0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
max_length = 150
for i in range(len(texts_train)):
  if len(texts_train[i]) > max_length:
    texts_train[i] = texts_train[i][:max_length]
    
for i in range(len(texts_test)):
  if len(texts_test[i]) > max_length:
    texts_test[i] = texts_test[i][:max_length]

In [46]:
print(np.array(texts_test).shape)

(2301,)


## Load models and Compute outputs (If the outputs are already saved you can skip this step)

In [35]:
#roBERTa models
sent_MOD = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
aggr_MOD = f"cardiffnlp/twitter-roberta-base-offensive"
gen_MOD = f"roberta-base"

#Load tokenizer and models
tokenizer = AutoTokenizer.from_pretrained(sent_MOD)
sent_mod = TFAutoModelForSequenceClassification.from_pretrained(sent_MOD)
aggr_mod = TFAutoModelForSequenceClassification.from_pretrained(aggr_MOD)
gen_mod = TFAutoModel.from_pretrained(gen_MOD)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-offensive.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint o

In [2]:
def Compute_Embeddings(tokenizer,sent_mod,aggr_mod,gen_mod,texts,batch_size):
  # Split texts into batches
  batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
  batch_out_1 = []
  batch_out_2 = []
  batch_out_3 = []
  for batch in batches:
      encoded_inputs = tokenizer(batch,padding=True,return_tensors='tf')
      # Compute the outputs of the modules as numpy arrays.
      emb_1 = sent_mod(encoded_inputs)[0]
      scores_1 = []
      for i in range(emb_1.shape[0]):
            scores_1.append(softmax(emb_1[i].numpy()))
      scores_1 = np.array(scores_1)
      batch_out_1.append(scores_1)

      emb_2 = aggr_mod(encoded_inputs)[0]
      scores_2 = []
      for i in range(emb_2.shape[0]):
            scores_2.append(softmax(emb_2[i].numpy()))
      scores_2 = np.array(scores_2)
      batch_out_2.append(scores_2)

      emb_3 = gen_mod(encoded_inputs)[0]
      pool_3 = GlobalAveragePooling1D()(emb_3).numpy()
      batch_out_3.append(pool_3)
    
      encoded_inputs = None
      emb_1 = None
      emb_2 = None
      emb_3 = None
      gc.collect() 
    
  outs_1 = np.concatenate(batch_out_1,axis=0)
  outs_2 = np.concatenate(batch_out_2,axis=0)
  outs_3 = np.concatenate(batch_out_3,axis=0)  

  return outs_1,outs_2,outs_3

In [20]:
#### Train set

batch_size = 32

t = time()
outs_sent,outs_aggr,outs_gen = Compute_Embeddings(tokenizer,sent_mod,aggr_mod,gen_mod,texts_train,batch_size)
print("Elapsed time:",time()-t)

np.save('mod_outs/sent_train_outs.npy',outs_sent)
np.save('mod_outs/aggr_train_outs.npy',outs_aggr)
np.save('mod_outs/gen_train_outs.npy',outs_gen)

Elapsed time: 176.75620126724243


In [9]:
#### Test set

t = time()
outs_sent,outs_aggr,outs_gen = Compute_Embeddings(tokenizer,sent_mod,aggr_mod,gen_mod,texts_test,batch_size)
print("Elapsed time:",time()-t)

np.save('mod_outs/sent_test_outs.npy',outs_sent)
np.save('mod_outs/aggr_test_outs.npy',outs_aggr)
np.save('mod_outs/gen_test_outs.npy',outs_gen)

Elapsed time: 48.96830487251282


## Load outputs

In [4]:
outs_sent_train = np.load('mod_outs/sent_train_outs.npy')
outs_aggr_train = np.load('mod_outs/aggr_train_outs.npy')
outs_gen_train = np.load('mod_outs/gen_train_outs.npy')

outs_sent_test = np.load('mod_outs/sent_test_outs.npy')
outs_aggr_test = np.load('mod_outs/aggr_test_outs.npy')
outs_gen_test = np.load('mod_outs/gen_test_outs.npy')

In [5]:
print(outs_gen_train.shape)

(8404, 768)


## Hate prediction with the 3 models

In [6]:
conc_out_train = np.concatenate([outs_sent_train, outs_aggr_train, outs_gen_train], axis=1)
conc_out_test = np.concatenate([outs_sent_test, outs_aggr_test, outs_gen_test], axis=1)

In [7]:
hate_model = Sequential([
      Flatten(),
      Dense(128, activation='relu'),
      Dense(128, activation='relu'),
      Dense(1, activation='sigmoid')
  ])
hate_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [8]:
t = time()
hate_model.fit(conc_out_train, labels_train, epochs=5, batch_size=32)
print("Elapsed time:",time()-t)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed time: 6.6339616775512695


In [9]:
print("Train set:")
probs = hate_model.predict(conc_out_train)
preds = (probs >= 0.5).astype(int)
acc = metrics.accuracy_score(preds,labels_train)
f1 = metrics.f1_score(preds,labels_train,zero_division=1)
print("Accuracy:",acc)
print("f1_score:",f1)

Train set:
Accuracy: 0.7961684911946693
f1_score: 0.7271858576206403


In [10]:
print("Test set:")
probs = hate_model.predict(conc_out_test)
preds = (probs >= 0.5).astype(int)
acc = metrics.accuracy_score(preds,labels_test)
f1 = metrics.f1_score(preds,labels_test,zero_division=1)
print("Accuracy:",acc)
print("f1_score:",f1)

Test set:
Accuracy: 0.7692307692307693
f1_score: 0.7018528916339135


## Hate prediction with the general model

In [11]:
hate_model_gen = Sequential([
      Flatten(),
      Dense(128, activation='relu'),
      Dense(128, activation='relu'),
      Dense(1, activation='sigmoid')
  ])
hate_model_gen.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
t = time()
hate_model_gen.fit(outs_gen_train, labels_train, epochs=5, batch_size=32)
print("Elapsed time:",time()-t)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed time: 7.545494079589844


In [13]:
print("Train set:")
probs = hate_model_gen.predict(outs_gen_train)
preds = (probs >= 0.5).astype(int)
acc = metrics.accuracy_score(preds,labels_train)
f1 = metrics.f1_score(preds,labels_train,zero_division=1)
print("Accuracy:",acc)
print("f1_score:",f1)

Train set:
Accuracy: 0.7652308424559734
f1_score: 0.7196248401307376


In [14]:
print("Test set:")
probs = hate_model_gen.predict(outs_gen_test)
preds = (probs >= 0.5).astype(int)
acc = metrics.accuracy_score(preds,labels_test)
f1 = metrics.f1_score(preds,labels_test,zero_division=1)
print("Accuracy:",acc)
print("f1_score:",f1)

Test set:
Accuracy: 0.7309865275966971
f1_score: 0.6897243107769424


## Compute HateBERT outputs (If these outputs are already saved you can skip this step)

In [98]:
model_dir = "HateBERT_hateval"
HateBERT = BertModel.from_pretrained(
    model_dir # Use pre-trained model from its directory, change this to use a pre-trained model from bert
)
HateBERT.eval()
HateBERT_tok = BertTokenizer.from_pretrained(model_dir)

Some weights of the model checkpoint at HateBERT_hateval were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [99]:
def Comp_Embed_HateBERT(tokenizer,mod,texts,batch_size):
  # Split texts into batches
  batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
  batch_out = []

  with torch.no_grad():
      for batch in batches:
          encoded_inputs = tokenizer(batch,padding=True,return_tensors='pt')
          # Compute the output of the module as a numpy array.

          emb = mod(**encoded_inputs)[0]
          tf_emb = tf.convert_to_tensor(emb.numpy())  
          pool = GlobalAveragePooling1D()(tf_emb).numpy()
          batch_out.append(pool)

          encoded_inputs = None
          tf_emb = None
          gc.collect() 
    
  outs = np.concatenate(batch_out,axis=0)
  return outs

In [100]:
###Train set
batch_size = 32

t = time()
outs_train = Comp_Embed_HateBERT(HateBERT_tok,HateBERT,texts_train,batch_size)
print("Elapsed time:",time()-t)
print(outs_train.shape)

np.save('mod_outs/HateBERT_train_outs.npy',outs_train)

Elapsed time: 530.0718879699707
(8404, 768)


In [101]:
###Test set
t = time()
outs_test = Comp_Embed_HateBERT(HateBERT_tok,HateBERT,texts_test,batch_size)
print("Elapsed time:",time()-t)
print(outs_test.shape)

np.save('mod_outs/HateBERT_test_outs.npy',outs_test)

Elapsed time: 140.3664355278015
(2301, 768)


## Hate prediction with HateBERT

In [102]:
outs_train = np.load('mod_outs/HateBERT_train_outs.npy')
outs_test = np.load('mod_outs/HateBERT_test_outs.npy')

In [103]:
classifier_HateBERT = Sequential([
      Flatten(),
      Dense(128, activation='relu'),
      Dense(128, activation='relu'),
      Dense(1, activation='sigmoid')
  ])
classifier_HateBERT.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [104]:
t = time()
classifier_HateBERT.fit(outs_train, labels_train, epochs=5, batch_size=32)
print("Elapsed time:",time()-t)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed time: 9.10550570487976


In [105]:
print("Train set:")
probs = classifier_HateBERT.predict(outs_train)
preds = (probs >= 0.5).astype(int)
acc = metrics.accuracy_score(preds,labels_train)
f1 = metrics.f1_score(preds,labels_train,zero_division=1)
print("Accuracy:",acc)
print("f1_score:",f1)

Train set:
Accuracy: 0.8788672060923369
f1_score: 0.8242403314917126


In [106]:
print("Test set:")
probs = classifier_HateBERT.predict(outs_test)
preds = (probs >= 0.5).astype(int)
acc = metrics.accuracy_score(preds,labels_test)
f1 = metrics.f1_score(preds,labels_test,zero_division=1)
print("Accuracy:",acc)
print("f1_score:",f1)

Test set:
Accuracy: 0.7357670578009561
f1_score: 0.6279069767441859
