# Preliminaries

In [1]:
# Set to True when running this notebook on Google Colab. It switches on the
# Google Drive mount, the Drive-based data path and the simpletransformers
# install in the cells below.
RUN_ON_COLAB = False

In [2]:
# On Colab the project files live on Google Drive, so mount it first.
if RUN_ON_COLAB:
    import google.colab.drive as drive

    drive.mount('/content/gdrive')

In [3]:
from pathlib import Path

# Absolute path of the `data` folder inside the current working directory;
# the folder is created on first use and left untouched if it already exists.
PATH_DATA = Path().resolve().joinpath('data')
PATH_DATA.mkdir(exist_ok=True)

In [4]:
import pandas as pd

# Prefix for every project file: the mounted Drive folder on Colab,
# otherwise paths stay relative to the working directory.
base_path = '/content/gdrive/My Drive/' if RUN_ON_COLAB else ''

# Read the three processed CSV files in a fixed order:
# OLID (small) train, HASOC train, OLID test.
df_train_olid_small, df_train_hasoc, df_test_olid = (
    pd.read_csv(base_path + relative_csv)
    for relative_csv in (
        'data/processed/olid-train-small.csv',
        'data/processed/hasoc-train.csv',
        'data/processed/olid-test.csv',
    )
)

In [5]:
# Install simpletransformers when running on Colab.
# The install goes through the kernel's own interpreter (`sys.executable -m pip`)
# so the package lands in the environment this notebook imports from, and
# `check_call` raises CalledProcessError if pip fails instead of silently
# continuing to a confusing ImportError in the next cell.
if RUN_ON_COLAB:
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "simpletransformers"])

In [6]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import StratifiedKFold
from simpletransformers.classification import ClassificationModel
from scipy.special import softmax

# Setting

In [7]:
# Experiment setting; it decides which training set the next cell selects:
#   'in_domain'    -> OLID (small) training data
#   'cross_domain' -> HASOC training data
# Uncomment exactly one of the two lines.
# setting = 'in_domain'
setting = 'cross_domain'

In [8]:
# Select the training set that matches `setting`:
# OLID (small) for 'in_domain', HASOC for 'cross_domain'.
if setting == 'in_domain':
    train = df_train_olid_small
elif setting == 'cross_domain':
    train = df_train_hasoc
else:
    # Fail loudly on a typo; otherwise `train` would stay undefined or,
    # worse, keep pointing at the data set chosen in an earlier run.
    raise ValueError(
        f"Unknown setting {setting!r}; expected 'in_domain' or 'cross_domain'."
    )

# $k$-fold Cross Validation

<img src="https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/pred_probs_cross_val.png" width="35%">
Image Source: https://raw.githubusercontent.com/cleanlab/assets/master/cleanlab/pred_probs_cross_val.png

The cell below shows a small example with 4 folds over 4 instances.
In each fold, a model is trained on three of the instances and predicts the single remaining (held-out) instance, so every instance receives exactly one out-of-sample prediction.

In [9]:
# Number of folds (and of sampled rows) for the small illustrative example in
# the next cell. The "Running it" cell further down assigns its own value.
NUMBER_OF_SPLITS = 4

In [10]:
# Toy illustration of the fold assignment: draw NUMBER_OF_SPLITS rows from the
# first 20 HASOC training rows and print, for every fold, the row labels used
# for training and the row labels predicted for.
df_temp = df_train_hasoc.head(20).sample(NUMBER_OF_SPLITS, random_state=1)
rskf = StratifiedKFold(n_splits=NUMBER_OF_SPLITS, shuffle=True, random_state=89)
fold_splits = rskf.split(df_temp['text'], df_temp['labels'])

for fold_no, (fit_positions, held_out_positions) in enumerate(fold_splits, start=1):
    fit_rows = df_temp.iloc[fit_positions]
    held_out_rows = df_temp.iloc[held_out_positions]

    print(
        f'Fold {fold_no}',
        'Training on:',
        fit_rows.index.values,
        'Predicting for:',
        held_out_rows.index.values,
        sep='\n',
    )

Fold 1
Training on:
[ 3  6 10]
Predicting for:
[16]
Fold 2
Training on:
[16  6 10]
Predicting for:
[3]
Fold 3
Training on:
[ 3 16 10]
Predicting for:
[6]
Fold 4
Training on:
[ 3 16  6]
Predicting for:
[10]


### Running it

In [11]:
# Show which setting the cross-validation run below will use.
print(setting)

cross_domain


In [12]:
# Debug switch for the cross-validation cell below. When True, that cell uses
# only the first 20 training rows, 2 folds and 'bert-base-uncased', and it does
# not write the output CSV.
debug = True

In [13]:
# Out-of-sample predictions via stratified k-fold cross-validation:
# 5 folds over all models for the full run, 2 folds with a single model when
# `debug` is True. Every training instance is predicted exactly once per model
# by a model that was trained on the other folds.

NUMBER_OF_SPLITS = 5
RANDOM_SEED = 89
random.seed(RANDOM_SEED)
SIMPLETRANSFORMERS_ARGS = {
    'reprocess_input_data':True,
    'overwrite_output_dir':True,
    'manual_seed': RANDOM_SEED
}
ALL_MODEL = ['GroNLP/hateBERT', 'diptanu/fBERT', 'bert-base-uncased']

# Keep `train` itself untouched and work on a separate name. Re-binding `train`
# to its first 20 rows would survive in the kernel, so a later run with
# `debug = False` would silently cross-validate (and save) on 20 rows only.
cv_data = train
if debug:
    cv_data = train.head(20)
    NUMBER_OF_SPLITS = 2
    ALL_MODEL = ['bert-base-uncased']

rskf = StratifiedKFold(n_splits=NUMBER_OF_SPLITS, shuffle=True, random_state=RANDOM_SEED)
pred = [] # list of class outputs (predictions)
probabilities = [] # list of probability outputs
gold = [] # list of gold labels
index = [] # list of row positions (into cv_data) of the predicted instances
machine = [] # list of model_name used to make predictions

for train_index, test_index in rskf.split(cv_data['text'], cv_data['labels']):
    train_df = cv_data.iloc[train_index]
    test_df = cv_data.iloc[test_index]

    for model_name in ALL_MODEL:
        # A fresh model per (fold, model_name) pair, so nothing learned on one
        # fold carries over to another.
        model = ClassificationModel('bert', model_name, args=SIMPLETRANSFORMERS_ARGS, use_cuda=True)

        model.train_model(train_df) # Train on D\D_i
        predictions, prob = model.predict(test_df.text.to_list()) # Predict for D_i

        # Update variables
        gold.extend(test_df['labels'])
        pred.extend(predictions)
        machine.extend([model_name]*np.size(test_index))
        # here we apply the softmax function to the outputs to get probabilities
        # only P(hate) is kept, as P(not hate) = 1 - P(hate)
        probabilities.extend(softmax(prob,axis=1)[:,1])
        index.extend(test_index)

# `index` holds positions, so map them back to ids with one positional lookup
# instead of rebuilding the full id list for every single prediction.
output = pd.DataFrame({
    'id': cv_data['id'].iloc[index].to_list(),
    'label': gold,
    'model': machine,
    'probabilities': probabilities,
    'predicted': pred,
})
# One row per id, one column per (value, model) pair.
# NOTE(review): pivot_table aggregates with its default (mean), so this assumes
# every id occurs once per model — TODO confirm ids are unique in the data.
output = pd.pivot_table(output, values= ['label','predicted','probabilities'], index=['id'], columns=['model'])

if not debug:
    # Create the target folder first: to_csv does not create missing
    # directories, and failing here would throw away the whole training run.
    output_dir = Path(base_path) / 'model_outputs' / 'out_of_sample'
    output_dir.mkdir(parents=True, exist_ok=True)
    output.to_csv(output_dir / f"{setting}.csv")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

After running the notebook on Colab with `debug = False` (once per setting), save the resulting CSV files in `model_outputs/` to get:
```
model_outputs
├── out_of_sample
│   ├── cross_domain.csv
│   └── in_domain.csv
└── ...
```