In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from scipy.special import softmax

In [3]:
import re
import string

# Characters that survive stripping: ASCII letters, digits, punctuation and
# ASCII whitespace. Built once instead of once per character.
_ASCII_KEEP = frozenset(
    string.ascii_letters + string.digits + string.punctuation + string.whitespace
)


def remove_emojis(text):
    """Drop every character that is not plain ASCII text.

    Despite the name, this removes *all* characters outside the ASCII
    letters/digits/punctuation/whitespace set (emojis, accented letters, ...).

    Whitespace such as tabs and newlines is deliberately kept: deleting it
    would glue the neighbouring words together ("a\\nb" -> "ab"). Later
    whitespace normalisation can still collapse it into a single space.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every non-whitelisted character removed.
    """
    return ''.join(char for char in text if char in _ASCII_KEEP)

def remove_tags(text):
    """Delete @-mentions from ``text``.

    A mention is an ``@`` immediately followed by one or more word
    characters; the whole match is replaced by the empty string, so the
    surrounding whitespace is left as it was.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

def remove_multispace(text):
    """Collapse whitespace and light punctuation into single spaces.

    Any run made of whitespace characters and/or the punctuation marks
    ``.`` ``,`` ``"`` ``'`` is replaced by exactly one space. Treating the
    whole run as one match (rather than replacing each punctuation mark and
    each whitespace run separately) guarantees the result never contains two
    consecutive spaces, e.g. ``"hello, world"`` -> ``"hello world"``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. Leading/trailing runs become a single space;
        they are not stripped.
    """
    return re.sub(r'[\s.,"\']+', ' ', text)

def preprocess_pipeline(text):
    """Run ``text`` through the cleaning steps and return the result.

    The steps are applied in this fixed order, each one receiving the output
    of the previous one: ``remove_emojis``, ``remove_tags``,
    ``remove_multispace``.
    """
    for step in (remove_emojis, remove_tags, remove_multispace):
        text = step(text)
    return text

In [4]:
def _read_and_clean(csv_path):
    """Read ``csv_path`` and pass its 'text' column through preprocess_pipeline."""
    frame = pd.read_csv(csv_path)
    frame['text'] = frame['text'].apply(preprocess_pipeline)
    return frame


# Two training files and one test file, all cleaned the same way.
olid = _read_and_clean('data/olid-train-small.csv')
hasoc = _read_and_clean('data/hasoc-train.csv')
test_data = _read_and_clean('data/olid-test.csv')

In [5]:
# Experimental setup: one argument object that is passed to every base model.
model_args = ClassificationArgs()
model_args.train_batch_size = 8
model_args.num_train_epochs = 3
model_args.learning_rate = 1e-4
model_args.warmup_ratio = 0.1
# Fix the seed so that fine-tuning runs can be repeated and compared.
model_args.manual_seed = 42

# In-training evaluation / early stopping is currently switched off:
# model_args.evaluate_during_training_steps = 20
# model_args.evaluate_during_training = True
# model_args.evaluate_during_training_verbose = True
# model_args.use_early_stopping = True
# model_args.early_stopping_patience = 30
# NOTE(review): the earlier note on the patience line said "increased to 20,
# not 10 as in fbret paper", which does not match the value 30 — confirm the
# intended patience before re-enabling.

# Utilities: overwrite previous outputs and avoid multiprocessing.
model_args.save_eval_checkpoints = False
model_args.overwrite_output_dir = True
model_args.use_multiprocessing = False
model_args.use_multiprocessing_for_evaluation = False

# Disable tokenizer-level parallelism via the environment.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:

# Build the three base classifiers in one pass. Every checkpoint is loaded
# through the "bert" model type with the shared model_args, on CPU.
bert_model_olid, hatebert_model_olid, fbert_model = (
    ClassificationModel("bert", checkpoint, args=model_args, use_cuda=False)
    for checkpoint in ("bert-base-uncased", "GroNLP/hateBERT", "diptanu/fBERT")
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at diptanu/fBERT and are newly initialized: ['bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.model_selection import StratifiedKFold
from simpletransformers.classification import ClassificationModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Data
X_base = olid['text']
y = olid['labels']

X_test = test_data['text']
y_test = test_data['labels']
test_texts = X_test.tolist()

# Checkpoints of the three base models, in the column order of the meta-features.
BASE_CHECKPOINTS = ('bert-base-uncased', 'GroNLP/hateBERT', 'diptanu/fBERT')


def build_base_models():
    """Return freshly initialised (not yet fine-tuned) base classifiers.

    One model per entry of BASE_CHECKPOINTS, in that order, built with the
    same model type, args and device setting used elsewhere in the notebook.
    """
    return [
        ClassificationModel('bert', checkpoint, args=model_args, use_cuda=False)
        for checkpoint in BASE_CHECKPOINTS
    ]


def fit_base_models(models, train_df):
    """Fine-tune every model in ``models`` on the text/labels columns of ``train_df``."""
    for model in models:
        model.train_model(train_df[['text', 'labels']])


def stack_base_predictions(models, texts):
    """Return an (n_texts, n_models) array of predicted labels, one column per model."""
    return np.column_stack([model.predict(texts)[0] for model in models])


# Initialize meta-models (seeded so the forest is reproducible)
meta_model_rf = RandomForestClassifier(random_state=42)
meta_model_lr = LogisticRegression()

# Initialize StratifiedKFold cross-validator
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Out-of-fold meta-features (base predictions) and their true labels
meta_features = []
true_labels = []

# Perform StratifiedKFold cross-validation
for train_index, val_index in skf.split(X_base, y):
    # skf.split yields positions, so index positionally rather than by label.
    X_train, X_val = X_base.iloc[train_index], X_base.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Start from fresh base models in every fold. Re-using models that were
    # already trained in a previous fold would mean they have seen this fold's
    # validation rows, leaking labels into the meta-features.
    fold_models = build_base_models()
    train = pd.DataFrame({'text': X_train, 'labels': y_train})
    fit_base_models(fold_models, train)

    # Predict on the held-out rows and store them with their true labels
    meta_features.append(stack_base_predictions(fold_models, X_val.tolist()))
    true_labels.extend(y_val)

meta_features = np.vstack(meta_features)

# Train meta-models on the out-of-fold stacked features
meta_model_rf.fit(meta_features, true_labels)
meta_model_lr.fit(meta_features, true_labels)

# Refit fresh base models on the full training data for test-time features.
# They are bound to the notebook's existing model names so that any later
# cell using those names sees the final fine-tuned models.
final_models = build_base_models()
fit_base_models(final_models, pd.DataFrame({'text': X_base, 'labels': y}))
bert_model_olid, hatebert_model_olid, fbert_model = final_models

# Predict with base models on test set
base_preds_bert = bert_model_olid.predict(test_texts)[0]
base_preds_hatebert = hatebert_model_olid.predict(test_texts)[0]
base_preds_fbert = fbert_model.predict(test_texts)[0]
# Legacy alias: these predictions come from fBERT, not RoBERTa.
base_preds_roberta = base_preds_fbert

# Stack base model predictions for test set (same column order as in training)
stacked_test_features = np.column_stack((base_preds_bert, base_preds_hatebert, base_preds_fbert))

# Use meta-models for final prediction
ensemble_preds_rf = meta_model_rf.predict(stacked_test_features)
ensemble_preds_lr = meta_model_lr.predict(stacked_test_features)

# For Random Forest Ensemble
classification_rf = classification_report(y_test, ensemble_preds_rf)
print("Classification Report for Random Forest Ensemble:")
print(classification_rf)

# For Logistic Regression Ensemble
classification_lr = classification_report(y_test, ensemble_preds_lr)
print("Classification Report for Logistic Regression Ensemble:")
print(classification_lr)
