In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
from functools import partial

import torch
import pandas as pd
from tqdm import tqdm

from transformers import AutoTokenizer

In [3]:
from data import COLUMNS, hierarchy_lookup, comtok_create_thread_text, _preprocess, convert_word_to_token_level
from eval import f1_by_annotator, f1_conditional_selection, f1_majority, jaccard_index, jaccard_sampled

## Initialize

In [4]:
# Model selection: exactly one of the configurations below should be active.
# Each option binds the same four names, so the rest of the notebook is model-agnostic:
#   ToxicReasoningModel - model class, loaded with .from_pretrained(MODEL)
#   MODEL_KEY           - identifier handed to AutoTokenizer.from_pretrained
#   MODEL               - checkpoint name/path handed to ToxicReasoningModel.from_pretrained
#   MODEL_NAME          - short tag used as the prefix of the CSVs written to eval_outputs/

# Option (disabled): XLM-RoBERTa
# from model_xlm_roberta import XLMRobertaForToxicReasoning as ToxicReasoningModel
# MODEL_KEY = 'FacebookAI/xlm-roberta-base'
# MODEL = 'saved_model-0207-1705'
# MODEL_NAME = 'roberta'

# Option (active): EuroBERT
from model_eurobert import EuroBertForToxicReasoning as ToxicReasoningModel
MODEL_KEY = 'EuroBERT/EuroBERT-610m'
MODEL = 'saved_eurobert_apr1_1401'
MODEL_NAME = 'eurobert'

In [5]:
# Extra special token that is registered with the tokenizer below.
COMMENT_TOKEN = "<COMMENT>"

tokenizer = AutoTokenizer.from_pretrained(MODEL_KEY)

# Kept as the last expression of the cell so the return value of
# add_special_tokens is displayed.
tokenizer.add_special_tokens(dict(additional_special_tokens=[COMMENT_TOKEN]))

1

In [6]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

model = ToxicReasoningModel.from_pretrained(MODEL, device_map=device)
# Look the token id up through COMMENT_TOKEN instead of repeating the literal,
# so the id handed to the model always belongs to the token that was registered
# with the tokenizer.
model.further_init(tokenizer.vocab[COMMENT_TOKEN])
# Inference only: put the model into evaluation mode.
model = model.eval()

cuda


## Evaluation

In [7]:
# Pickled test split; the path is relative to the notebook's working directory.
TEST_DATA_FILE_NAME = '../data/temporal/preprocessed_test.pkl'

In [8]:
# NOTE: unpickling can execute arbitrary code - only point this at trusted files.
raw_test_df = pd.read_pickle(TEST_DATA_FILE_NAME)
test_df = _preprocess(raw_test_df)

# Sanity check: (rows, columns) of the preprocessed test set.
test_df.shape

(8738, 113)

In [9]:
# TODO fix this upstream somewhere
# Workaround: expose `answer_toxicity` under the name `answer_pp_toxicity` as well.
# NOTE(review): presumably the evaluation helpers expect the `answer_pp_*`
# naming for this column - confirm before removing.
test_df['answer_pp_toxicity'] = test_df['answer_toxicity']

In [10]:
# Build the thread-level frame. COMMENT_TOKEN is passed instead of a repeated
# literal so the token used here is the one registered with the tokenizer.
test_thread_df = comtok_create_thread_text(test_df, tokenizer, COMMENT_TOKEN)

100%|██████████| 1649/1649 [00:15<00:00, 107.94it/s]

Split up 467 that were too long otherwise.
Skipped 91 that were still too long after.





In [11]:
# Inspect which columns the thread-level frame provides.
test_thread_df.columns

Index(['st_id', 'text', 'ids', 'label_toxicity', 'label_counternarrative',
       'label_justInappropriate', 'label_hasImplication', 'label_subject',
       'label_subjectGroupType', 'label_subjectTokens', 'label_hasOther',
       'label_other', 'label_otherTokens', 'label_implTopic',
       'label_implTopicTokens', 'label_implPolarity', 'label_implStereotype',
       'label_implSarcasm', 'label_implTemporality', 'label_authorBelief',
       'label_authorPrefer', 'label_authorAccount', 'label_typicalBelief',
       'label_typicalPrefer', 'label_expertBelief', 'comment_i'],
      dtype='object')

In [12]:
# The language code is the two characters between "batch_" and "_<digits>.csv"
# in the source filename.
LANG_PATTERN = r'batch_(..)_\d+\.csv'
test_df['lang'] = test_df['filename'].str.extract(LANG_PATTERN)

# Show which languages occur in the test set.
test_df['lang'].unique()

array(['nl', 'en', 'es', 'tr', 'ar', 'de'], dtype=object)

### Use model to predict on test set

In [13]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a Python number.

    The computation is split by sign so that ``math.exp`` is only ever called
    with a non-positive argument. The naive formula calls ``math.exp(-x)``,
    which raises ``OverflowError`` for strongly negative ``x``.

    Args:
        x: a real number (int or float).

    Returns:
        A float in the closed interval [0, 1]. The bounds are reached when the
        result saturates in floating point.
    """
    if x >= 0:
        # exp(-x) is at most 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-x))
    # Algebraically identical form for negative x; exp(x) is below 1 here.
    z = math.exp(x)
    return z / (1 + z)

In [14]:
def convert_prediction(name, value):
    """Convert one raw model output for column ``name`` into its evaluation value.

    The conversion depends on the column definition in ``COLUMNS[name]`` and on
    the shape of the output:

    * If the column's ``reverse_map`` has any ``float`` key, the scalar output is
      passed through ``sigmoid`` and bucketed into a 5-valued ordinal label
      ('Very low' ... 'Very high').
    * Otherwise, a scalar output is thresholded and mapped through ``reverse_map``.
    * Otherwise, for columns of type ``'mc'`` the arg-max index is mapped
      through ``reverse_map``.
    * Anything else is treated as multi-label and returned as a plain list of
      the raw output values.

    Args:
        name: key into ``COLUMNS``.
        value: tensor holding the model output for a single comment.

    Returns:
        A label (taken from ``reverse_map`` or one of the ordinal strings),
        ``'NA'`` when the ordinal probability is NaN, or a list for
        multi-label columns.
    """
    value = value.squeeze().detach().cpu()

    column = COLUMNS[name]
    rev_map = column.reverse_map

    if any(type(k) is float for k in rev_map.keys()):
        # this is a binary field where we want a 5-valued ordinal
        p = sigmoid(value.item())
        if p < 0.15:
            return 'Very low'
        if p < 0.35:
            return 'Low'
        if p < 0.65:
            return 'Medium'
        if p < 0.85:
            return 'High'
        if p <= 1:
            # `<=` on purpose: for large outputs the sigmoid saturates to exactly
            # 1.0 in floating point, which is the most confident 'Very high'
            # case and must not be reported as 'NA'.
            return 'Very high'
        # Only reachable when p is NaN (every comparison above is then False).
        return 'NA'

    if len(value.shape) == 0:
        # this is a binary field where we want to threshold
        # NOTE(review): the raw output is compared against 0.5 here, while the
        # ordinal branch above applies a sigmoid first. If this head emits
        # logits, the probability-0.5 cut would be `value < 0` - confirm.
        int_value = 0 if value < 0.5 else 1
        return rev_map[int_value]

    if column.type == 'mc':
        # multi-class: pick the highest-scoring class
        int_value = value.argmax().squeeze().item()
        return rev_map[int_value]

    # multi-label: hand back the raw per-label outputs, un-thresholded
    return value.tolist()

# Predictions for the whole test set:
#   st_preds[st_id][(st_nr, comment_id)] -> {column: converted prediction}
# Columns ending in 'Tokens' hold a {word: bool} dict, all other columns hold
# the result of convert_prediction.
st_preds = {}

for st_id, t_df_1 in tqdm(test_df.groupby('st_id')):
    t_df_2 = test_thread_df.loc[test_thread_df['st_id'] == st_id]
    if t_df_2.empty:
        # No model input exists for this thread. Record an empty entry so that
        # lookups find no prediction instead of this cell failing on the empty
        # frame below.
        st_preds[st_id] = {}
        continue

    thread_texts = t_df_2[['ids', 'text']].apply(lambda r: (tuple(r['ids']), r['text']), axis=1).unique()

    # contains the index of the comment that each word is a part of
    # NOTE(review): pairing this with `thread_texts` via zip assumes both
    # `.unique()` calls yield the thread pieces in the same order and number.
    wl_comment_indices = t_df_2['comment_i'].apply(tuple).unique()

    all_preds = {}  # the thread might have been split to fit in the model
    for (ids, thread_text), wl_comment_i in zip(thread_texts, wl_comment_indices):
        tokens = tokenizer(thread_text, return_tensors='pt', is_split_into_words=False)
        # Inference only: do not build an autograd graph for the forward pass.
        with torch.no_grad():
            preds, _ = model(**{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in tokens.items()})

        # == convert token-level predictions to word-level ==
        encoding = tokens.encodings[0]
        token_keys = [k for k in preds.keys() if k.endswith('Tokens')]
        comment_word_pred = [{key: [] for key in token_keys} for _ in range(len(ids))]
        # get the index of the comment that each token is a part of
        tl_comment_indices = convert_word_to_token_level(encoding, thread_text, wl_comment_i, is_boolean=False)
        # number of words in the encoded text; identical for every key
        n_words = max(w for w in encoding.word_ids if w is not None) + 1

        for key in token_keys:
            token_logits = preds[key].squeeze()
            assert len(tl_comment_indices) == len(token_logits)

            # collect the logits of all tokens that belong to the same word
            word_preds = [[] for _ in range(n_words)]
            for tok_idx, logit in enumerate(token_logits.tolist()):
                word_i = encoding.token_to_word(tok_idx)
                if word_i is not None:
                    word_preds[word_i].append(logit)

            for word_logits, comment_idx in zip(word_preds, wl_comment_i):
                # average across word
                if comment_idx is None or comment_idx < 0:
                    continue
                comment_word_pred[comment_idx][key].append(
                    sum(word_logits) / len(word_logits) if len(word_logits) > 0 else None
                )
        # == == == == ==

        # `comment_id` (not `id`) so the builtin is not shadowed in the notebook namespace
        for i, (st_nr, comment_id) in enumerate(ids):
            all_preds[(st_nr, comment_id)] = {
                key: convert_prediction(key, preds[key][i])
                for key in preds.keys() if not key.endswith('Tokens')
            }

            comment_words = t_df_1.loc[t_df_1['comment_id'] == comment_id, 'comment_body_tokens'].unique()[0].split()
            assert len(comment_words) == len(comment_word_pred[i]['subjectTokens'])
            # A word counts as selected when its averaged logit is positive.
            # NOTE(review): the dict is keyed by the word itself, so a word that
            # occurs several times in a comment keeps only its last value.
            word_level_fields = {
                key: {
                    w: (logit > 0) if logit is not None else False
                    for w, logit in zip(comment_words, comment_word_pred[i][key])
                }
                for key in comment_word_pred[i].keys()
            }
            all_preds[(st_nr, comment_id)] |= word_level_fields

    st_preds[st_id] = all_preds

100%|██████████| 547/547 [00:33<00:00, 16.54it/s]


In [15]:
# test_df.to_csv('eval_outputs/test_data.csv')
# test_df.shape

In [16]:
# Build the lookup once; get_prediction indexes it by column name to reach the
# column's hierarchy node.
HIERARCHY_LOOKUP = hierarchy_lookup()
# Show which columns are covered by the hierarchy.
HIERARCHY_LOOKUP.keys()

dict_keys(['toxicity', 'counternarrative', 'justInappropriate', 'hasImplication', 'authorBelief', 'authorPrefer', 'authorAccount', 'typicalBelief', 'typicalPrefer', 'expertBelief', 'implTopic', 'implTopicTokens', 'implPolarity', 'implTemporality', 'implStereotype', 'hasOther', 'other', 'otherTokens', 'subject', 'subjectGroupType', 'subjectTokens'])

In [17]:
def get_prediction(st_id, st_nr, comment_id, column, optimistic=False):
    """Look up the stored prediction for one column of one comment.

    Args:
        st_id: thread id, key into ``st_preds``.
        st_nr: together with ``comment_id`` forms the key of the comment
            inside the thread's predictions.
        comment_id: see ``st_nr``.
        column: name of the predicted column to return.
        optimistic: when True, return the stored prediction without checking
            the hierarchy. When False, the prediction is only returned if every
            column listed by the parent node's ``ancestor_values()`` was
            predicted with the value its hierarchy node requires.

    Returns:
        The stored prediction, or ``'NA'`` when the comment has no predictions
        or (non-optimistic mode) an ancestor condition is not met.
    """
    thread_preds = st_preds[st_id]
    comment_key = (st_nr, comment_id)
    if comment_key not in thread_preds:
        # no predictions for this comment (probably didn't fit into the model)
        return 'NA'

    comment_pred = thread_preds[comment_key]
    if optimistic:
        return comment_pred[column]

    parent = HIERARCHY_LOOKUP[column].parent
    if parent is None:
        # top of the hierarchy: nothing to condition on
        return comment_pred[column]

    def required_value(col):
        # label that `col` must have been predicted as for its children to apply
        node = HIERARCHY_LOOKUP[col]
        return COLUMNS[col].reverse_map[node.conditions[node.columns.index(col)]]

    for ancestor in parent.ancestor_values():
        if not (comment_pred[ancestor] == required_value(ancestor)):
            return 'NA'

    return comment_pred[column]


### Majority evaluation

In [18]:
# Majority evaluation, run once with and once without the optimistic lookup.
maj_results = []
for optimism in [True, False]:
    maj_results_df = f1_majority(test_df, partial(get_prediction, optimistic=optimism))
    # Column name spelled 'optimistic' (was misspelled) so this CSV uses the
    # same column as the other evaluation outputs.
    maj_results_df['optimistic'] = optimism
    maj_results.append(maj_results_df)

pd.concat(maj_results).to_csv(f'eval_outputs/{MODEL_NAME}_eval_group1.csv')

### By-annotator evaluation

Here we measure how much the model agrees with each individual annotator.

In [19]:
# results_df = f1_by_annotator(test_df, get_prediction)
# results_df.to_csv(f'{MODEL_NAME}_by_annotator.csv')

### Conditional Evaluation

In [20]:
# Conditional evaluation over every (optimistic, aggregation) combination;
# the optimistic setting varies slowest.
cond_results = []
eval_grid = [
    (opt, agg)
    for opt in (True, False)
    for agg in ('none', 'max-score', 'random')
]
for optimism, aggregation in eval_grid:
    predict = partial(get_prediction, optimistic=optimism)
    results_df = f1_conditional_selection(test_df, predict, jaccard_index, aggregation=aggregation)
    # tag each result frame with the setting that produced it
    results_df['aggregation'] = aggregation
    results_df['optimistic'] = optimism
    cond_results.append(results_df)

cond_results_df = pd.concat(cond_results)
cond_results_df.to_csv(f'eval_outputs/{MODEL_NAME}_eval_group2b.csv')

subject subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 8738
subjectGroupType subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 8738
other otherTokens
cond_scores (8738, 2) 388
eval_predictions (8738,) 8738
implTopic implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
implPolarity implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
implTemporality implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
subject subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 8738
subjectGroupType subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 8738
other otherTokens
cond_scores (8738, 2) 388
eval_predictions (8738,) 8738
implTopic implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
implPolarity implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
implTemporality implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
su

### Jaccard Evaluation

In [21]:
# Sampled Jaccard evaluation, once per lookup mode (optimistic first).
jacc_results = []
for optimism in (True, False):
    predict = partial(get_prediction, optimistic=optimism)
    jaccard_df = jaccard_sampled(test_df, predict)
    # remember which lookup mode produced these rows
    jaccard_df['optimistic'] = optimism
    jacc_results.append(jaccard_df)

jacc_results_df = pd.concat(jacc_results)
jacc_results_df.to_csv(f'eval_outputs/{MODEL_NAME}_eval_group2a.csv')

answer_pp_implDetected
False    6711
True     2027
Name: count, dtype: int64


100%|██████████| 50/50 [00:01<00:00, 43.16it/s]
100%|██████████| 50/50 [00:00<00:00, 239.57it/s]
100%|██████████| 50/50 [00:01<00:00, 37.38it/s]
100%|██████████| 50/50 [00:01<00:00, 44.45it/s]
100%|██████████| 50/50 [00:00<00:00, 81.70it/s] 
100%|██████████| 50/50 [00:01<00:00, 43.28it/s]
100%|██████████| 50/50 [00:02<00:00, 22.82it/s]
100%|██████████| 50/50 [00:00<00:00, 59.20it/s]
100%|██████████| 50/50 [00:01<00:00, 25.18it/s]
100%|██████████| 50/50 [00:02<00:00, 23.08it/s]
100%|██████████| 50/50 [00:00<00:00, 98.32it/s] 
100%|██████████| 50/50 [00:02<00:00, 22.53it/s]
100%|██████████| 50/50 [00:01<00:00, 36.03it/s]
100%|██████████| 50/50 [00:00<00:00, 94.20it/s] 
100%|██████████| 50/50 [00:01<00:00, 35.77it/s]
100%|██████████| 50/50 [00:01<00:00, 45.81it/s]
100%|██████████| 50/50 [00:00<00:00, 194.45it/s]
100%|██████████| 50/50 [00:00<00:00, 64.42it/s]


answer_pp_implDetected
False    6711
True     2027
Name: count, dtype: int64


100%|██████████| 50/50 [00:01<00:00, 37.60it/s]
100%|██████████| 50/50 [00:00<00:00, 242.43it/s]
100%|██████████| 50/50 [00:01<00:00, 42.32it/s]
100%|██████████| 50/50 [00:01<00:00, 37.28it/s]
100%|██████████| 50/50 [00:00<00:00, 75.92it/s]
100%|██████████| 50/50 [00:01<00:00, 42.82it/s]
100%|██████████| 50/50 [00:02<00:00, 22.88it/s]
100%|██████████| 50/50 [00:00<00:00, 69.55it/s]
100%|██████████| 50/50 [00:02<00:00, 22.90it/s]
100%|██████████| 50/50 [00:02<00:00, 21.96it/s]
100%|██████████| 50/50 [00:00<00:00, 96.79it/s] 
100%|██████████| 50/50 [00:02<00:00, 22.54it/s]
100%|██████████| 50/50 [00:01<00:00, 40.26it/s]
100%|██████████| 50/50 [00:00<00:00, 136.66it/s]
100%|██████████| 50/50 [00:01<00:00, 35.51it/s]
100%|██████████| 50/50 [00:00<00:00, 65.77it/s]
100%|██████████| 50/50 [00:00<00:00, 123.52it/s]
100%|██████████| 50/50 [00:00<00:00, 52.63it/s]
