# Preliminaries

In [1]:
from pathlib import Path

# Absolute location of the data directory, anchored at the current working directory.
PATH_DATA = Path().resolve().joinpath('data')
# Create the directory on first use; an existing one is left untouched.
PATH_DATA.mkdir(exist_ok=True)

In [2]:
import pandas as pd

# Read the pre-processed CSV splits. Paths are built from PATH_DATA (defined in
# the first cell) instead of repeating the 'data/...' prefix as string literals.
PATH_PROCESSED = PATH_DATA / 'processed'

df_train_olid_small = pd.read_csv(PATH_PROCESSED / 'olid-train-small.csv')
df_train_hasoc = pd.read_csv(PATH_PROCESSED / 'hasoc-train.csv')
df_test_olid = pd.read_csv(PATH_PROCESSED / 'olid-test.csv')

In [3]:
# Uncomment to install the dependency when it is missing (e.g. on Colab):
# %pip install simpletransformers

In [4]:
import pandas as pd
import random
import numpy as np
from simpletransformers.classification import ClassificationModel
from scipy.special import softmax

# Setting

In [5]:
# Experiment to run. The selection cell below recognises two values:
# 'in_domain' and 'cross_domain'. Swap the commented line to switch.
# setting = 'in_domain'
setting = 'cross_domain'

In [6]:
# Choose the training set for the configured experiment:
#   'in_domain'    -> train on df_train_olid_small
#   'cross_domain' -> train on df_train_hasoc
# Both settings are evaluated on df_test_olid.
if setting == 'in_domain':
    train = df_train_olid_small
elif setting == 'cross_domain':
    train = df_train_hasoc
else:
    # Fail loudly: without this branch a typo would leave `train` undefined
    # (or silently keep a stale value from a previous run of this cell).
    raise ValueError(
        f"Unknown setting {setting!r}; expected 'in_domain' or 'cross_domain'."
    )
test = df_test_olid

## Running it

In [7]:
# Print the selected setting ahead of the run below.
print(setting)

cross_domain


In [8]:
# When True, the run cell below keeps only the first 20 training rows and the
# first 10 test rows, and restricts ALL_MODEL to 'bert-base-uncased'.
debug = True

In [9]:
# Train every model in ALL_MODEL on `train`, predict on `test`, and collect the
# results into `output`, a table indexed by instance id with one column group
# per model.
RANDOM_SEED = 89
random.seed(RANDOM_SEED)
SIMPLETRANSFORMERS_ARGS = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'manual_seed': RANDOM_SEED,
}
ALL_MODEL = ['GroNLP/hateBERT', 'diptanu/fBERT', 'bert-base-uncased']

if debug:
    # Smoke-test configuration: a handful of rows and a single model.
    train = train.head(20)
    test = test.head(10)
    ALL_MODEL = ['bert-base-uncased']

test_index = test.index

pred = []  # list of class outputs (predictions)
probabilities = []  # list of probability outputs
gold = []  # list of gold labels
index = []  # list of test index labels, repeated once per model
ids = []  # list of instance ids, aligned row-for-row with `gold`
machine = []  # list of model_name used to make predictions

for model_name in ALL_MODEL:
    model = ClassificationModel('bert', model_name, args=SIMPLETRANSFORMERS_ARGS, use_cuda=True)

    model.train_model(train)
    predictions, prob = model.predict(test.text.to_list())

    # Update variables
    gold.extend(test['labels'])
    # Take the ids straight from the frame, in the same row order as the gold
    # labels. The previous version looked ids up positionally using index
    # labels, which is only correct for a default 0..n-1 index and rebuilt the
    # id list once per row.
    ids.extend(test['id'])
    pred.extend(predictions)
    machine.extend([model_name] * len(test_index))
    # here we apply the softmax function to the outputs to get probabilities
    # only P(hate) is kept, as P(not hate) = 1 - P(hate)
    probabilities.extend(softmax(prob, axis=1)[:, 1])
    index.extend(test_index)

output = pd.DataFrame({
    'id': ids,
    'label': gold,
    'model': machine,
    'probabilities': probabilities,
    'predicted': pred,
})
# NOTE(review): pivot_table aggregates with its default function (mean), so
# this assumes each (id, model) pair occurs once -- confirm ids are unique.
output = pd.pivot_table(
    output,
    values=['label', 'predicted', 'probabilities'],
    index=['id'],
    columns=['model'],
)

# output.to_csv("/content/gdrive/My Drive/outputhasoc.csv")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

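After running this notebook on Colab once per setting, save each resulting CSV under `model_outputs/trained/`, named after its setting, to obtain the following layout: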

```
model_outputs
├── trained
│   ├── cross_domain.csv
│   └── in_domain.csv
└── ...
```