In [1]:
# enable IPython's autoreload extension; mode 2 re-imports all modules before a cell runs,
# so edits to the local `data` / `eval` modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json

import numpy as np
import pandas as pd

from data import _preprocess, COLUMNS
from eval import f1_conditional_selection, f1_majority, jaccard_sampled, jaccard_index

#### load test data

In [3]:
# path of the pickled test set, relative to the notebook's working directory
TEST_DATA_FILE_NAME = '../data/temporal/preprocessed_test.pkl'
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source
test_df = _preprocess(pd.read_pickle(TEST_DATA_FILE_NAME))

# TODO fix this upstream somewhere
# for now the 'answer_pp_toxicity' column is simply a copy of 'answer_toxicity'
test_df['answer_pp_toxicity'] = test_df['answer_toxicity']

# extract language from filename
# (the two characters between "batch_" and the numeric suffix of "batch_XX_<n>.csv")
test_df['lang'] = test_df['filename'].str.extract(r'batch_(..)_\d+\.csv')
# display the distinct language codes that were found
test_df['lang'].unique()


array(['nl', 'en', 'es', 'tr', 'ar', 'de'], dtype=object)

#### load gpt4o predictions

In [4]:
# folder holding the prediction CSV files of one GPT-4o run
TEST_PRED_FOLDER = '../data/temporal/gpt4o_2025-01-22_12-03-30'

# os.listdir returns the entries in arbitrary order; sorting the file names makes the
# row order of pred_df identical on every run and on every machine.
# Only files ending in '.csv' are read; everything else in the folder is ignored.
pred_df = pd.concat([
    pd.read_csv(os.path.join(TEST_PRED_FOLDER, f))
    for f in sorted(os.listdir(TEST_PRED_FOLDER))
    if f.endswith('.csv')
])

In [5]:
# pre process
# the 'answer' column is JSON-encoded twice, hence two json.loads passes; the decoded
# objects are then spread into one 'answer_<key>' column per key
df_answers = (
    pred_df['answer']
    .apply(json.loads)
    .apply(json.loads)
    .apply(pd.Series)
    .add_prefix('answer_')
)
# removes empty dictionaries (and any other falsy cell) and replaces them with NaN
df_answers = df_answers.mask(~df_answers.astype(bool), np.nan)
# swap the raw 'answer' column for the expanded answer columns
pred_df = pd.concat([pred_df.drop(columns='answer'), df_answers], axis=1)

# derive the two label columns from the flags stored in the 'trinary' answer
pred_df['answer_toxicity'] = [
    'Yes/Maybe' if a['_Yes/Maybe'] else 'No'
    for a in pred_df['answer_trinary']
]
pred_df['answer_counternarrative'] = [
    'Yes' if a['_Counter-speech'] else 'No'
    for a in pred_df['answer_trinary']
]

pred_pp_df = _preprocess(pred_df)


In [6]:
# keep only the leading parenthesised code -- "(" + one to three characters + ")" --
# optionally preceded by "... "; values that do not match become NaN
pred_df['answer_implTopic'] = (
    pred_df['answer_implTopic']
    .str
    .extract(r'^(?:\.\.\.\s)?(\(..?.?\))', expand=False)
)

## Evaluation

In [7]:
# sanity check: (number of rows, number of columns) of the test set
test_df.shape

(8738, 113)

In [8]:
def get_prediction(st_id, st_nr, comment_id, column):
    """Look up the predicted answer for one comment and one annotation column.

    This function is handed to the evaluation helpers as the prediction callback.

    Parameters
    ----------
    st_id, st_nr
        Not used by this lookup; accepted so the function can be called with
        the same arguments the evaluation helpers pass
        (NOTE(review): signature presumably dictated by `eval` -- confirm).
    comment_id
        Value matched against ``pred_df['comment_id']``.
    column
        Key into ``COLUMNS``; the answer is read from ``pred_df['answer_<column>']``.

    Returns
    -------
    object
        * ``'NA'`` when ``pred_df`` has no row for ``comment_id``.
        * For columns whose ``COLUMNS[column].type`` is ``'ml'``: a list with one
          entry per item of ``COLUMNS[column].values`` -- the predicted entry,
          or ``0`` when the item is absent from the answer.
        * For ``'hasImplication'``: ``str()`` of the list of items of
          ``COLUMNS['hasImplication'].values`` that are present and truthy in
          the answer.
        * Otherwise the raw cell value.

    Raises
    ------
    AssertionError
        If more than one prediction row matches ``comment_id``.
    """
    comment_df = pred_df[pred_df['comment_id'] == comment_id]
    assert len(comment_df) <= 1, f'multiple predictions for comment_id {comment_id!r}'
    if len(comment_df) == 0:
        return 'NA'

    answer = comment_df[f'answer_{column}'].values[0]
    is_multi_label = COLUMNS[column].type == 'ml'

    if is_multi_label or column == 'hasImplication':
        # Empty answers are stored as NaN after pre-processing. A float NaN does not
        # support the `in` operator, so treat it as "nothing selected" instead of
        # letting the membership test below raise a TypeError.
        if isinstance(answer, float) and np.isnan(answer):
            answer = {}

    if is_multi_label:
        # fixed-order vector over all possible values; absent values count as 0
        answer = [answer[v] if v in answer else 0 for v in COLUMNS[column].values]
    elif column == 'hasImplication':
        # stringified list of the values that are present and truthy in the answer
        answer = str([v for v in COLUMNS['hasImplication'].values if v in answer and bool(answer[v])])
    return answer

### Jaccard Evaluation

In [9]:
# run jaccard_sampled from the eval module on the test set, using get_prediction
# as the prediction callback
jaccard_df = jaccard_sampled(test_df, get_prediction)
# named like the other gpt4o_eval_group*.csv outputs of this notebook, so that the
# GPT-4o results do not overwrite the results file of a different model
jaccard_df.to_csv('gpt4o_eval_group2a.csv')

### Majority Evaluation

In [10]:
# run f1_majority from the eval module on the test set, using get_prediction as the
# prediction callback, and store the returned table as CSV
maj_results_df = f1_majority(test_df, get_prediction)
maj_results_df.to_csv('gpt4o_eval_group1.csv')

[autoreload of eval failed: Traceback (most recent call last):
  File "/home/stefan/Projects/toxic-reasoning/env-models/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/home/stefan/Projects/toxic-reasoning/env-models/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/home/stefan/Projects/toxic-reasoning/env-models/lib/python3.12/importlib/__init__.py", line 131, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 866, in _exec
  File "<frozen importlib._bootstrap_external>", line 995, in exec_module
  File "<frozen importlib._bootstrap_external>", line 1133, in get_code
  File "<frozen importlib._bootstrap_external>", line 1063, in source_to_code
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/home/stefan/Projects/toxic-r

### Conditional Evaluation

In [16]:
# one result table per aggregation mode, collected and written out as a single CSV
cond_results = []
for aggregation in ('none', 'max-score', 'random'):
    result_df = f1_conditional_selection(
        test_df,
        get_prediction,
        jaccard_index,
        aggregation=aggregation,
    )
    # tag every row with the mode it was computed with, so the modes stay
    # distinguishable after concatenation
    result_df['aggregation'] = aggregation
    cond_results.append(result_df)

pd.concat(cond_results).to_csv('gpt4o_eval_group2b.csv')

subject subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 2845
subjectGroupType subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 8738
other otherTokens
cond_scores (8738, 2) 388
eval_predictions (8738,) 2845
implTopic implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 2845
implPolarity implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 2845
implTemporality implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
subject subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 2845
subjectGroupType subjectTokens
cond_scores (8738, 2) 1807
eval_predictions (8738,) 8738
other otherTokens
cond_scores (8738, 2) 388
eval_predictions (8738,) 2845
implTopic implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 2845
implPolarity implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 2845
implTemporality implTopicTokens
cond_scores (8738, 2) 1963
eval_predictions (8738,) 8738
su