In [38]:
import pandas as pd
import numpy as np
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import torch

from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from scipy.special import softmax



In [6]:
import pandas as pd
import re
import string

def remove_emojis(text):
    """Keep only ASCII letters, digits, punctuation and the plain space.

    Despite the name, this drops more than emojis: every character outside
    ``string.ascii_letters``, ``string.digits``, ``string.punctuation`` and
    ``' '`` is discarded. That includes accented letters as well as tabs and
    newlines, which are removed outright rather than replaced by a space.

    Args:
        text: The string to filter.

    Returns:
        A new string containing only the allowed characters, in their
        original order.
    """
    # Build the whitelist once per call; the previous form re-concatenated
    # the four strings for every single character of the input.
    allowed = frozenset(string.ascii_letters + string.digits + string.punctuation + ' ')
    return ''.join(char for char in text if char in allowed)

def remove_tags(text):
    """Delete every @-mention (an ``@`` followed by one or more word characters).

    Only the mention itself is removed; surrounding whitespace is left as is.
    """
    mention = re.compile(r'@\w+')
    return mention.sub('', text)

def remove_multispace(text):
    """Collapse runs of whitespace and light punctuation into one space.

    Any maximal run made of whitespace, ``.``, ``,``, ``"`` or ``'`` is
    replaced by a single space. Treating the whole run as one match is what
    keeps the result free of double spaces: replacing punctuation and
    whitespace separately would turn ``"a, b"`` into ``"a  b"``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. Leading and trailing runs become a single
        space; they are not stripped.
    """
    return re.sub(r'[\s.,"\']+', ' ', text)

def preprocess_pipeline(text):
    """Clean one text by applying the three helpers in a fixed order.

    The order is: ``remove_emojis``, then ``remove_tags``, then
    ``remove_multispace``. Each step receives the previous step's output.
    """
    for step in (remove_emojis, remove_tags, remove_multispace):
        text = step(text)
    return text

In [7]:
def _read_and_clean(csv_path):
    """Read a CSV file and run ``preprocess_pipeline`` over its ``text`` column."""
    frame = pd.read_csv(csv_path)
    frame['text'] = frame['text'].apply(preprocess_pipeline)
    return frame


# Two training sets (OLID, HASOC) and the OLID test set, all cleaned the same way.
olid = _read_and_clean('data/olid-train-small.csv')
hasoc = _read_and_clean('data/hasoc-train.csv')
test_data = _read_and_clean('data/olid-test.csv')

## Training setup from the fBERT paper:
We used a batch-size of eight, Adam optimiser
with learning rate 1e−4, and a linear learning rate
warm-up over 10% of the training data. During the
training process, the parameters of the transformer
model, as well as the parameters of the subsequent
layers, were updated. The models were trained
using only training data. Furthermore, they were
evaluated while training using an evaluation set that
had one fifth of the rows in training data. We performed early stopping if the evaluation loss did not
improve over ten evaluation steps. All the models
were trained for three epochs.

In [12]:
# Experimental setup: optimisation hyper-parameters.
model_args = ClassificationArgs()
model_args.train_batch_size = 8
model_args.num_train_epochs = 3
model_args.learning_rate = 1e-4
model_args.warmup_ratio = 0.1

# Seed the fine-tuning run so results are repeatable; the train/eval split is
# already drawn with random_state=42.
model_args.manual_seed = 42

# Evaluate on the held-out split every 20 training steps and stop early when
# the monitored metric stops improving.
model_args.evaluate_during_training_steps = 20
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.use_early_stopping = True
# Patience is 30 evaluations here, not the 10 used in the fBERT paper.
model_args.early_stopping_patience = 30

# Utilities: no per-evaluation checkpoints, allow re-running into an existing
# output directory, and keep feature conversion single-process.
model_args.save_eval_checkpoints = False
model_args.overwrite_output_dir = True
model_args.use_multiprocessing=False
model_args.use_multiprocessing_for_evaluation=False

# Must be set before tokenizers are used; silences the fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [13]:
# Fine-tune bert-base-uncased on OLID. best_model_dir and the output_dir passed
# to train_model point at the same directory for this model.
BERT_OLID_DIR = 'outputs/bert_olid'
model_args.best_model_dir = BERT_OLID_DIR

# Hold out a fixed, seeded 15% of OLID for evaluation; the rest is training data.
eval_fraction = 0.15
eval_data = olid.sample(frac=eval_fraction, random_state=42)
train_data = olid[~olid.index.isin(eval_data.index)]

bert_model_olid = ClassificationModel("bert", "bert-base-uncased", args=model_args)

# Train with periodic evaluation on the held-out split.
bert_model_olid.train_model(train_data, eval_df=eval_data, output_dir=BERT_OLID_DIR)

# Score the in-memory model on the same held-out split.
(
    bert_model_olid_result,
    bert_model_olid_outputs,
    bert_model_olid_wrong_predictions,
) = bert_model_olid.eval_model(eval_data)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/622 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/622 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/110 [00:00<?, ?it/s]

In [14]:
# Fine-tune GroNLP/hateBERT on OLID.
model_args.best_model_dir = 'outputs/hatebert_olid'

# Hold out a fixed, seeded 15% of OLID for evaluation; the rest is training data.
eval_fraction = 0.15
eval_data = olid.sample(frac=eval_fraction, random_state=42)
train_data = olid[~olid.index.isin(eval_data.index)]

hatebert_model_olid = ClassificationModel('bert', 'GroNLP/hateBERT', args=model_args)

# NOTE(review): output_dir is under 'output/' (singular) while best_model_dir
# is under 'outputs/' -- confirm the two different roots are intentional.
hatebert_model_olid.train_model(
    train_data,
    eval_df=eval_data,
    output_dir='output/hatebert_olid',
)

# Score the in-memory model on the same held-out split.
(
    hatebert_model_olid_result,
    hatebert_model_olid_outputs,
    hatebert_model_olid_wrong_predictions,
) = hatebert_model_olid.eval_model(eval_data)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/622 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/622 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/110 [00:00<?, ?it/s]

In [15]:
# Fine-tune diptanu/fBERT on OLID.
model_args.best_model_dir = 'outputs/fbert_olid'

# Hold out a fixed, seeded 15% of OLID for evaluation; the rest is training data.
eval_fraction = 0.15
eval_data = olid.sample(frac=eval_fraction, random_state=42)
train_data = olid[~olid.index.isin(eval_data.index)]

fbert_model_olid = ClassificationModel('bert', 'diptanu/fBERT', args=model_args)

# NOTE(review): output_dir is under 'output/' (singular) while best_model_dir
# is under 'outputs/' -- confirm the two different roots are intentional.
fbert_model_olid.train_model(
    train_data,
    eval_df=eval_data,
    output_dir='output/fbert_olid',
)

# Score the in-memory model on the same held-out split.
(
    fbert_model_olid_result,
    fbert_model_olid_outputs,
    fbert_model_olid_wrong_predictions,
) = fbert_model_olid.eval_model(eval_data)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at diptanu/fBERT and are newly initialized: ['bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/622 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/622 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/110 [00:00<?, ?it/s]

### Ensemble methods
Hard majority voting:

In [35]:
# Reload the three fine-tuned OLID models from their saved directories.
bert_model_olid = ClassificationModel("bert", "outputs/bert_olid")
hatebert_model_olid = ClassificationModel("bert", "outputs/hatebert_olid")
fbert_model_olid = ClassificationModel("bert", "outputs/fbert_olid")

# Each model labels the same list of cleaned test texts; raw outputs are not needed here.
test_texts = test_data["text"].tolist()
predictions1, _ = bert_model_olid.predict(test_texts)
predictions2, _ = hatebert_model_olid.predict(test_texts)
predictions3, _ = fbert_model_olid.predict(test_texts)

# Hard voting: for every sample take the label that most of the three models chose.
ballots = zip(predictions1, predictions2, predictions3)
hard_majority_predictions = np.array([np.bincount(ballot).argmax() for ballot in ballots])


  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

## hard_majority_predictions

In [40]:
# Soft voting: predict again, this time keeping each model's raw outputs.
test_texts = test_data["text"].tolist()
predictions1, bert_model_olid_outputs = bert_model_olid.predict(test_texts)
predictions2, hatebert_model_olid_outputs = hatebert_model_olid.predict(test_texts)
predictions3, fbert_model_olid_outputs = fbert_model_olid.predict(test_texts)

# Turn each model's raw outputs into per-class probabilities (row-wise softmax).
probabilities1, probabilities2, probabilities3 = (
    softmax(raw_outputs, axis=1)
    for raw_outputs in (
        bert_model_olid_outputs,
        hatebert_model_olid_outputs,
        fbert_model_olid_outputs,
    )
)

# Unweighted mean of the three probability matrices.
average_probabilities = (probabilities1 + probabilities2 + probabilities3) / 3

# Predicted class = column with the largest averaged probability.
ensemble_predictions = average_probabilities.argmax(axis=1)

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/108 [00:00<?, ?it/s]

In [52]:
# Spot-check: soft-voting labels for the first ten test rows.
ensemble_predictions[:10]

array([0, 0, 1, 0, 0, 1, 1, 0, 0, 0])

In [53]:
# Spot-check: averaged class probabilities behind those first ten labels.
average_probabilities[:10]

array([[0.5347011 , 0.4652989 ],
       [0.64832554, 0.35167446],
       [0.48807834, 0.51192166],
       [0.79534822, 0.20465178],
       [0.80888279, 0.19111721],
       [0.16924852, 0.83075148],
       [0.39325228, 0.60674772],
       [0.53436552, 0.46563448],
       [0.74151631, 0.25848369],
       [0.85195355, 0.14804645]])