In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `data` module) are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import itertools
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

from data import _preprocess, COLUMNS

### LOAD TEST SET

In [3]:
# Annotated test set, stored as a pickled DataFrame.
# NOTE(review): pickle files can execute code on load -- only read trusted files.
TEST_DATA_FILE_NAME = '../data/temporal/preprocessed_test.pkl'
test_df = pd.read_pickle(TEST_DATA_FILE_NAME)
test_df = _preprocess(test_df)

# TODO fix this upstream somewhere
# (the preprocessed toxicity column is currently just a copy of the raw answer)
test_df['answer_pp_toxicity'] = test_df['answer_toxicity']

# The two characters between "batch_" and the batch number are the language code.
batch_language = test_df['filename'].str.extract(r'batch_(..)_\d+\.csv')
test_df['lang'] = batch_language
test_df['lang'].unique()

array(['nl', 'en', 'es', 'tr', 'ar', 'de'], dtype=object)


### LOAD GPT ANNOTATIONS

In [4]:
TEST_PRED_FOLDER = '../data/temporal/gpt4o_2025-01-22_12-03-30'

# os.listdir returns directory entries in arbitrary order; sort the file names so
# the row order of the concatenated frame is reproducible between runs/machines.
prediction_files = sorted(f for f in os.listdir(TEST_PRED_FOLDER) if f.endswith('.csv'))
gpt_df = pd.concat([pd.read_csv(os.path.join(TEST_PRED_FOLDER, f)) for f in prediction_files])

# expand answer field
# (json.loads is applied twice on purpose here: the first pass must yield a string
# that is itself JSON -- presumably the field is double-encoded; verify against the CSVs)
df_answers = gpt_df['answer'].apply(json.loads).apply(json.loads).apply(pd.Series).add_prefix('answer_')
df_answers = df_answers.where(df_answers.astype(bool), np.nan)  # removes empty dictionaries and replaces with NaN
gpt_df = pd.concat([gpt_df.drop(columns='answer'), df_answers], axis=1)

# split up trinary
# NOTE(review): these lambdas index into `answer_trinary`; a row whose trinary
# answer was replaced by NaN above would raise here -- confirm it is always filled.
gpt_df['answer_toxicity'] = gpt_df['answer_trinary'].apply(lambda a: 'Yes/Maybe' if a['_Yes/Maybe'] else 'No')
gpt_df['answer_counternarrative'] = gpt_df['answer_trinary'].apply(lambda a: 'Yes' if a['_Counter-speech'] else 'No')

# further preprocessing
gpt_df = _preprocess(gpt_df)
# mark every row as coming from the model so it can be treated as one more annotator
gpt_df['workerid'] = 'gpt4o'
# pred_df['answer_implTopic'] = pred_df['answer_implTopic'].str.extract(r'^(?:\.\.\.\s)?(\(..?.?\))')


In [5]:
# Language misidentifications (See https://docs.google.com/spreadsheets/d/1K7AHawqNgLOV4SryJGr_BbQFwcB8K4gPOR9Xattx5Q4)
MISIDENTIFIED_THREAD_IDS = {
    '60761694404071785', '5687396952074376', '48407333198278932', '53501753679094372',
    '54288785178874084', '66024864922700717', '71880017305366941', '16353993833502616',
    '43223883605944201', '50032915589739384', '52097527239380956',
}
# Drop the GPT annotations of these threads. The IDs above are strings, so compare
# on the string form of `st_id`: if the column was parsed as integers, a plain
# `isin` would match nothing and the filter would silently be a no-op.
gpt_df = gpt_df[~gpt_df['st_id'].astype(str).isin(MISIDENTIFIED_THREAD_IDS)]

### COMBINE

In [6]:
# Stack the test-set annotations and the GPT-4o annotations into one long frame
# (one row per annotator answer). concat is called without ignore_index, so the
# row labels of the two inputs are kept as they are.
df = pd.concat([test_df, gpt_df])
df.shape

(11630, 115)

In [7]:
# Write the combined annotations to CSV in the working directory.
df.to_csv('agreement_data.csv')

# Pre-processing

Add a boolean field that records whether the annotator filled out an implication in any of the three implication columns.


In [8]:
# One boolean Series per implication column: True where the text has at least one
# character (missing values compare as False).
impl1, impl2, impl3 = (
    df['answer_pp_' + field].str.len().gt(0)
    for field in ('englishImplication', 'implication', 'nativeImplication')
)
# A row counts as "implication filled out" if any of the three columns is non-empty.
df['nonEmptyImplication'] = impl1 | impl2 | impl3

# Export for manual inspection

In [9]:
# Display (rows, columns) of the combined frame.
df.shape

(11630, 116)

In [10]:
# Disabled: one-off CSV exports of the answers, pivoted per annotator, for manual inspection.
# df.pivot(
#     columns=['workerid'], index=['comment_id'], values=['answer_implication']
# ).to_csv('implications.csv')
# df.pivot(
#     columns=['workerid'], index=['st_id', 'st_nr', 'comment_body'], values=['answer_toxicity']
# ).to_csv('first_question_a.csv')
# df.pivot(
#     columns=['workerid'], index=['st_id', 'st_nr', 'comment_body'], values=['answer_counternarrative']
# ).to_csv('first_question_b.csv')
# df.pivot(
#     columns=['workerid'], index=['comment_id'], values=['answer_justInappropriate']
# ).to_csv('second_question.csv')
# df.pivot(
#     columns=['workerid', 'comment_body'], index=['comment_id'], values=['answer_implication']
# ).to_csv('implications_with_comments.csv')

# ...

In [11]:
def conditional_calculation(all_calcs, do_calc, data=None):
    """Run ``do_calc`` on the comments selected by each calculation spec.

    Parameters
    ----------
    all_calcs : iterable of dict
        One spec per calculation, with the keys ``prefix``, ``conditions``,
        ``to_calculate`` and ``labels``. ``conditions`` is a list of tuples whose
        first element names the condition and whose remaining elements are its
        arguments. A comment is kept only if it satisfies every condition:

        * ``('full_agreement', column)`` -- ``answer_pp_<column>`` has exactly
          one distinct non-missing value over all rows of the comment.
        * ``('full_agreement_on_value', column, value)`` -- every row of the
          comment has ``answer_pp_<column>`` equal to ``value``; if ``column``
          is a key of ``COLUMNS`` the answer is first passed through
          ``COLUMNS[column].apply_fn``.
        * ``('column_in_list', column, values)`` -- at least one row of the
          comment has ``column`` in ``values``.
    do_calc : callable
        Called as ``do_calc(suitable_df, col, prefix, labels)`` once for every
        entry ``col`` of the spec's ``to_calculate``. ``suitable_df`` holds the
        rows of the selected comments, sorted by ``comment_id``.
    data : pandas.DataFrame, optional
        Frame with a ``comment_id`` column to select from. Defaults to the
        notebook-level ``df``.

    Raises
    ------
    ValueError
        If a condition name is unknown or a condition has the wrong number of
        arguments.
    """
    if data is None:
        # Keep the original behaviour: operate on the notebook-level frame.
        data = df

    all_comment_ids = set(data['comment_id'].tolist())

    for calc in all_calcs:
        # Start from every comment and narrow down with each condition.
        suitable_comment_ids = set(all_comment_ids)
        for condition, *params in calc['conditions']:
            if condition == 'full_agreement':
                # comment ids where each annotator has the same answer (and it is not NaN)
                # Unpack the single argument: using the argument list itself as the
                # column name made the string concatenation below raise a TypeError.
                (column,) = params
                agreed = data.groupby(by=['comment_id'])['answer_pp_' + column].nunique() == 1
                comment_ids = set(agreed.index[agreed])
            elif condition == 'full_agreement_on_value':
                # comment ids where each annotator has a specific answer
                column, value = params

                def mapping(x):
                    # Columns unknown to COLUMNS are compared on their stored value.
                    if column not in COLUMNS:
                        return x
                    return COLUMNS[column].apply_fn(x)

                correct = data.groupby(by=['comment_id'])['answer_pp_' + column].apply(
                    lambda grp: (grp.map(mapping) == value).all()
                )
                comment_ids = set(correct.index[correct])
            elif condition == 'column_in_list':
                column, values = params
                comment_ids = set(data[data[column].isin(values)]['comment_id'].tolist())
            else:
                raise ValueError(f'Unknown condition: {condition!r}')

            suitable_comment_ids = suitable_comment_ids & comment_ids

        # rows of the comments that meet all conditions
        suitable_df = data[data['comment_id'].isin(suitable_comment_ids)].sort_values('comment_id')

        for col in calc['to_calculate']:
            do_calc(suitable_df, col, calc['prefix'], calc['labels'])

# Calculate Agreements

In [12]:
# Per batch file: number of distinct annotators, shown as the shape tuple of value_counts().
df.groupby('filename')['workerid'].apply(lambda grp: grp.value_counts().shape).reset_index(name='Nr')

Unnamed: 0,filename,Nr
0,batch_ar_1.csv,"(4,)"
1,batch_ar_10.csv,"(4,)"
2,batch_ar_11.csv,"(4,)"
3,batch_ar_12.csv,"(4,)"
4,batch_ar_13.csv,"(4,)"
...,...,...
110,batch_tr_5.csv,"(4,)"
111,batch_tr_6.csv,"(4,)"
112,batch_tr_7.csv,"(4,)"
113,batch_tr_8.csv,"(4,)"


In [13]:
# Per batch file: the distinct annotator ids that have rows in that batch.
df.groupby('filename')['workerid'].apply(lambda grp: grp.unique()).reset_index()

Unnamed: 0,filename,workerid
0,batch_ar_1.csv,"[husseinsarrar, ilham, nada, gpt4o]"
1,batch_ar_10.csv,"[husseinsarrar, ilham, nada, gpt4o]"
2,batch_ar_11.csv,"[husseinsarrar, ilham, nada, gpt4o]"
3,batch_ar_12.csv,"[husseinsarrar, ilham, nada, gpt4o]"
4,batch_ar_13.csv,"[husseinsarrar, ilham, nada, gpt4o]"
...,...,...
110,batch_tr_5.csv,"[alp, doruk, selman, gpt4o]"
111,batch_tr_6.csv,"[alp, doruk, selman, gpt4o]"
112,batch_tr_7.csv,"[alp, doruk, selman, gpt4o]"
113,batch_tr_8.csv,"[alp, doruk, selman, gpt4o]"


In [194]:
# Batch numbers to include per language ('en' starts at 4; 'nl' skips 3 and 16).
_BATCH_NUMBERS = {
    'en': range(4, 22),
    'nl': [number for number in range(1, 18) if number not in (3, 16)],
    'de': range(1, 21),
    'es': range(1, 20),
    'tr': range(1, 20),
    'ar': range(1, 20),
}

# Language code -> list of batch file names of the form "batch_<lang>_<number>.csv".
LANG_BATCHES = {
    lang: ['batch_' + lang + '_' + str(number) + '.csv' for number in numbers]
    for lang, numbers in _BATCH_NUMBERS.items()
}

In [227]:
# define all the inter-annotator agreements we want to calculate
# Specification of all the inter-annotator agreements we want to calculate: one
# dict per (question, language). Entries are grouped per question, with all
# languages of one question before the next question starts.
IAA = []

# Toxicity and counter-narrative: restricted only to the language's batch files.
IAA += [
    dict(
        prefix=f'IAA_toxicity_{lang}_',
        conditions=[('column_in_list', 'filename', batches)],
        to_calculate=['toxicity'],
        labels=['Yes/Maybe', 'No'],
    )
    for lang, batches in LANG_BATCHES.items()
]
IAA += [
    dict(
        prefix=f'IAA_counternarrative_{lang}_',
        conditions=[('column_in_list', 'filename', batches)],
        to_calculate=['counternarrative'],
        labels=COLUMNS['counternarrative'].values,
    )
    for lang, batches in LANG_BATCHES.items()
]

# justInappropriate: only comments that every annotator marked as toxic
# ('Yes/Maybe') and as not being a counter-narrative ('No').
IAA += [
    dict(
        prefix=f'IAA_justInappropriate_{lang}_',
        conditions=[
            ('column_in_list', 'filename', batches),
            ('full_agreement_on_value', 'toxicity', 'Yes/Maybe'),
            ('full_agreement_on_value', 'counternarrative', 'No'),
        ],
        to_calculate=['justInappropriate'],
        labels=COLUMNS['justInappropriate'].values,
    )
    for lang, batches in LANG_BATCHES.items()
]

# Implication questions: only comments where every annotator has implDetected == True.
# Disabled entries (previously present as commented-out code):
#   * 'other', which additionally required full agreement on hasOther == '[]'
#   * 'authorBelief', 'authorPrefer', 'authorAccount', 'typicalBelief',
#     'typicalPrefer', 'expertBelief' (same conditions as the entries below)
IAA += [
    dict(
        prefix=f'IAA_{col}_{lang}_',
        conditions=[
            ('column_in_list', 'filename', batches),
            ('full_agreement_on_value', 'implDetected', True),
        ],
        to_calculate=[col],
        labels=COLUMNS[col].values,
    )
    for col in ('subject', 'implPolarity', 'implTopic', 'implStereotype', 'implSarcasm')
    for lang, batches in LANG_BATCHES.items()
]


In [228]:
# Accumulates the score records of every iaa_calculation call (reset when this cell is re-run).
all_results = []

def iaa_calculation(suitable_df, col, key, labels):
    """Compute Cohen's and Fleiss' kappa for one answer column and store them.

    Parameters
    ----------
    suitable_df : pandas.DataFrame
        One row per (annotator, comment) with the columns ``workerid``,
        ``comment_id``, ``filename``, ``st_id`` and ``answer_pp_<col>``.
    col : str
        Name of the answer without the ``answer_pp_`` prefix.
    key : str
        Prefix identifying the calculation; stored in every result record.
    labels : sequence
        Passed as ``labels`` to ``cohen_kappa_score``.

    Side effects
    ------------
    Prints the calculation name and warnings, and appends one record per
    annotator pair (score ``'cohen'``) plus two Fleiss records
    (``'fleiss-all'`` and ``'fleiss-w/o gpt'``) to the module-level
    ``all_results`` list.

    Raises
    ------
    ValueError
        Re-raised from ``cohen_kappa_score`` after printing diagnostics.
    """
    col = 'answer_pp_' + col
    name = key + col
    results = []

    print(name)

    n_comments = len(suitable_df['comment_id'].unique())

    # calculate cohen's kappa between pairs of annotators
    workers = suitable_df['workerid'].unique().tolist()
    for w1, w2 in itertools.combinations(workers, r=2):
        # The two answer lists are compared position by position, so order both
        # annotators' rows by comment id instead of relying on the caller's order.
        w1_df = suitable_df.loc[suitable_df['workerid'] == w1].sort_values('comment_id', kind='mergesort')
        w2_df = suitable_df.loc[suitable_df['workerid'] == w2].sort_values('comment_id', kind='mergesort')
        # only comments that both annotators answered
        valid_comment_ids = (set(w1_df.loc[w1_df[col].notna(), 'comment_id'].unique())
                             & set(w2_df.loc[w2_df[col].notna(), 'comment_id'].unique()))
        w1_df = w1_df.loc[w1_df['comment_id'].isin(valid_comment_ids)]
        w2_df = w2_df.loc[w2_df['comment_id'].isin(valid_comment_ids)]

        y1 = w1_df[col]
        y2 = w2_df[col]
        if len(y1) < n_comments:
            print(f'WARNING reduced support cohen {w1} - {w2}:', len(y1), n_comments)

        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                value = cohen_kappa_score(y1.tolist(), y2.tolist(), labels=labels)
        except ValueError:
            print(f'ERROR   {w1} ({len(y1)}), {w2} ({len(y2)})')
            # Both frames were filtered to the same comment ids, so the lengths can
            # only differ when an annotator has several rows for one comment. Report
            # those rows per annotator. (Looking rows up through the index labels,
            # as before, is unreliable when labels are not unique.)
            for worker, worker_df in ((w1, w1_df), (w2, w2_df)):
                repeated = worker_df['comment_id'].duplicated(keep=False)
                print(f'{worker} duplicated comments:',
                      worker_df.loc[repeated, ['filename', 'st_id', 'comment_id']])
            raise

        if pd.isna(value):
            print(f'WARNING NaN for cohen {w1} - {w2}:', y1.unique(), y2.unique())

        results.append({'key': key, 'col': col, 'score': 'cohen', 'w1': w1, 'w2': w2, 'support': len(y1), 'value': value})

    # calculate fleiss' kappa
    # rows: comments, columns: (col, workerid) pairs, cells: that annotator's answer
    answers_by_comment = suitable_df.pivot(columns=['workerid'], index=['comment_id'], values=[col])

    def fleiss(answers, score_name):
        # Fleiss' kappa needs an answer from every annotator, so keep complete rows only.
        valid = answers.notna().all(axis=1)
        support = int(valid.sum())
        if support < len(valid):
            print('WARNING reduced support fleiss:', support, len(valid))

        if support == 0 or answers.shape[1] < 2:
            # Nothing to compare (no complete comment, or fewer than two annotators):
            # record NaN instead of failing inside statsmodels.
            print(f'WARNING NaN for fleiss-{score_name}: {support} comments, {answers.shape[1]} annotators')
            value = np.nan
        else:
            # aggregate_raters turns the (comment x annotator) answers into
            # (comment x category) counts, the input format of fleiss_kappa.
            table, _categories = aggregate_raters(answers.loc[valid].to_numpy())
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                value = fleiss_kappa(table)
        results.append({'key': key, 'col': col, 'score': f'fleiss-{score_name}', 'value': value, 'support': support})

    fleiss(answers_by_comment, 'all')
    # Second variant without the model's answers; if there is no gpt4o column
    # there is nothing to drop and the frame is used as is.
    gpt_column = (col, 'gpt4o')
    if gpt_column in answers_by_comment.columns:
        fleiss(answers_by_comment.drop(columns=[gpt_column]), 'w/o gpt')
    else:
        fleiss(answers_by_comment, 'w/o gpt')

    # write answers to csv for manual checking
    # answers_by_comment.to_csv('answers_by_comment_' + name + '.csv')

    all_results.extend(results)


# Evaluate every agreement specification, then store the collected scores as CSV.
conditional_calculation(all_calcs=IAA, do_calc=iaa_calculation)
results_df = pd.DataFrame(all_results)
results_df.to_csv(path_or_buf='agreement_scores.csv')


IAA_toxicity_en_answer_pp_toxicity
IAA_toxicity_nl_answer_pp_toxicity
IAA_toxicity_de_answer_pp_toxicity
IAA_toxicity_es_answer_pp_toxicity
IAA_toxicity_tr_answer_pp_toxicity
IAA_toxicity_ar_answer_pp_toxicity
IAA_counternarrative_en_answer_pp_counternarrative
IAA_counternarrative_nl_answer_pp_counternarrative
IAA_counternarrative_de_answer_pp_counternarrative
IAA_counternarrative_es_answer_pp_counternarrative
IAA_counternarrative_tr_answer_pp_counternarrative
IAA_counternarrative_ar_answer_pp_counternarrative
IAA_justInappropriate_en_answer_pp_justInappropriate
IAA_justInappropriate_nl_answer_pp_justInappropriate
IAA_justInappropriate_de_answer_pp_justInappropriate
IAA_justInappropriate_es_answer_pp_justInappropriate
IAA_justInappropriate_tr_answer_pp_justInappropriate
IAA_justInappropriate_ar_answer_pp_justInappropriate
IAA_subject_en_answer_pp_subject
IAA_subject_nl_answer_pp_subject
IAA_subject_de_answer_pp_subject
IAA_subject_es_answer_pp_subject
IAA_subject_tr_answer_pp_subject
I

# ...

In [197]:
from sklearn.metrics import precision_recall_fscore_support

Add the majority vote of the human annotators and score the GPT-4o answers against it.

In [208]:
def scores_calculation(suitable_df, col, key, labels):
    """Score the GPT-4o answers against the majority vote of the other annotators.

    Parameters
    ----------
    suitable_df : pandas.DataFrame
        One row per (annotator, comment) with the columns ``workerid``,
        ``comment_id`` and ``answer_<col>``; rows with ``workerid == 'gpt4o'``
        are the predictions, all other rows are the reference annotations.
    col : str
        Name of the answer without the ``answer_`` prefix.
    key : str
        Prefix identifying the calculation; only used in the printed header.
    labels : sequence
        Passed as ``labels`` to ``precision_recall_fscore_support``; the
        printed arrays have one entry per label, in this order.

    Side effects
    ------------
    Writes the majority/GPT-4o comparison table to ``majority_vs_gpt4o.csv``
    and prints precision, recall, F-score and support.
    """
    # NOTE(review): this reads the raw `answer_<col>` column, whereas
    # iaa_calculation reads `answer_pp_<col>` -- confirm which one matches `labels`.
    answer_col = f'answer_{col}'
    is_gpt = suitable_df['workerid'] == 'gpt4o'

    def majority_vote(answers):
        # mode() skips missing answers and returns the most frequent values in
        # sorted order, so ties resolve to the first of them. When every answer
        # is missing there is no mode at all: report a missing vote instead of
        # failing on an empty result.
        modes = answers.mode()
        return modes.iloc[0] if len(modes) else np.nan

    majority = (
        suitable_df[~is_gpt]
        .groupby(by=['comment_id'])[answer_col]
        .apply(majority_vote)
        .rename('majority')
    )
    gpt4o = suitable_df[is_gpt].set_index('comment_id')[answer_col].rename('gpt4o')
    # Outer join: comments missing on one side stay in the table with a missing value.
    both = pd.merge(majority, gpt4o, left_index=True, right_index=True, how='outer')

    # NOTE(review): every call writes to the same file, so after a full run it
    # only holds the table of the last calculation.
    both.to_csv('majority_vs_gpt4o.csv')

    y_true = both['majority'].tolist()
    y_pred = both['gpt4o'].tolist()

    # Identify the calculation in the output, as iaa_calculation does.
    print(f'{key}{answer_col}')

    result = precision_recall_fscore_support(y_true, y_pred, labels=labels)
    # Use separate loop names so the `key` parameter is not overwritten.
    for metric, values in zip(['precis', 'recall', 'fscore', 'suppor'], result):
        print(f'{metric}: {values}')

    print('---------')

# Run the majority-vote comparison for every specification in IAA.
conditional_calculation(IAA, scores_calculation)

precis: [0.81884058 0.71612903]
recall: [0.56218905 0.89878543]
fscore: [0.66666667 0.79712747]
suppor: [201 247]
---------
precis: [0.68269231 0.80988593]
recall: [0.58677686 0.86585366]
fscore: [0.63111111 0.83693517]
suppor: [121 246]
---------
precis: [0.95419847 0.64      ]
recall: [0.48076923 0.97560976]
fscore: [0.63938619 0.77294686]
suppor: [260 246]
---------
precis: [0.94219653 0.68944099]
recall: [0.61977186 0.95689655]
fscore: [0.74770642 0.80144404]
suppor: [263 232]
---------
precis: [0.89361702 0.8129771 ]
recall: [0.77419355 0.91416309]
fscore: [0.82962963 0.86060606]
suppor: [217 233]
---------
precis: [0.76303318 0.82372881]
recall: [0.75586854 0.82935154]
fscore: [0.75943396 0.82653061]
suppor: [213 293]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0.         0.         1.         0.95652174 0.        ]
recall: [0. 0. 1. 1. 0.]
fscore: [0.         0.         1.         0.97777778 0.        ]
suppor: [ 0  1  1 22  0]
---------
precis: [0.         0.71428571 1.         0.84615385 0.        ]
recall: [0.         0.71428571 1.         0.84615385 0.        ]
fscore: [0.         0.71428571 1.         0.84615385 0.        ]
suppor: [ 0  7  3 13  0]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0.     0.     0.     0.9375 0.    ]
recall: [0.     0.     0.     0.9375 0.    ]
fscore: [0.     0.     0.     0.9375 0.    ]
suppor: [ 0  1  1 16  0]
---------
precis: [1.         0.5        0.66666667 0.96969697 0.        ]
recall: [1.         1.         0.57142857 0.91428571 0.        ]
fscore: [1.         0.66666667 0.61538462 0.94117647 0.        ]
suppor: [ 1  2  7 35  0]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0. 0. 0. 1. 0.]
recall: [0. 0. 0. 1. 0.]
fscore: [0. 0. 0. 1. 0.]
suppor: [ 0  0  0 16  0]
---------
precis: [0.         0.         0.5        0.76190476 0.        ]
recall: [0.         0.         0.5        0.94117647 0.        ]
fscore: [0.         0.         0.5        0.84210526 0.        ]
suppor: [ 1  4  2 17  0]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0.875 0.    0.   ]
recall: [1. 0. 0.]
fscore: [0.93333333 0.         0.        ]
suppor: [21  2  1]
---------
precis: [1. 0. 0.]
recall: [1. 0. 0.]
fscore: [1. 0. 0.]
suppor: [23  0  0]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [1. 0. 0.]
recall: [1. 0. 0.]
fscore: [1. 0. 0.]
suppor: [18  0  0]
---------
precis: [1.         0.         0.33333333]
recall: [0.93181818 0.         1.        ]
fscore: [0.96470588 0.         0.5       ]
suppor: [44  0  1]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [1. 0. 0.]
recall: [0.9375 0.     0.    ]
fscore: [0.96774194 0.         0.        ]
suppor: [16  0  0]
---------
precis: [0.95833333 0.         0.        ]
recall: [1. 0. 0.]
fscore: [0.9787234 0.        0.       ]
suppor: [23  0  1]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0. 0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0. 0.]
fscore: [0. 0. 0. 0. 0. 0. 0.]
suppor: [0 0 0 0 0 0 0]
---------
precis: [0. 0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0. 0.]
fscore: [0. 0. 0. 0. 0. 0. 0.]
suppor: [0 0 0 0 0 0 0]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0. 0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0. 0.]
fscore: [0. 0. 0. 0. 0. 0. 0.]
suppor: [0 0 0 0 0 0 0]
---------
precis: [0. 0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0. 0.]
fscore: [0. 0. 0. 0. 0. 0. 0.]
suppor: [0 0 0 0 0 0 0]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0. 0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0. 0.]
fscore: [0. 0. 0. 0. 0. 0. 0.]
suppor: [0 0 0 0 0 0 0]
---------
precis: [0. 0. 0. 0. 0. 0. 0.]
recall: [0. 0. 0. 0. 0. 0. 0.]
fscore: [0. 0. 0. 0. 0. 0. 0.]
suppor: [0 0 0 0 0 0 0]
---------
precis: [0.11111111 1.        ]
recall: [1.         0.27272727]
fscore: [0.2        0.42857143]
suppor: [ 2 22]
---------
precis: [0.3 1. ]
recall: [1.   0.65]
fscore: [0.46153846 0.78787879]
suppor: [ 3 20]
---------
precis: [0.85714286 0.75      ]
recall: [0.92307692 0.6       ]
fscore: [0.88888889 0.66666667]
suppor: [13  5]
---------
precis: [0.8 0.8]
recall: [0.76190476 0.83333333]
fscore: [0.7804878  0.81632653]
suppor: [21 24]
---------
precis: [0.6        0.83333333]
recall: [0.85714286 0.55555556]
fscore: [0.70588235 0.66666667]
suppor: [7 9]
---------
precis: [0.14285714 0.9       ]
recall: [0.66666667 0.42857143]
fscore: [0.23529412 0.58064516]
suppor: [ 3 21]
---------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precis: [0. 1.]
recall: [0.         0.95833333]
fscore: [0.        0.9787234]
suppor: [ 0 24]
---------
precis: [0.33333333 1.        ]
recall: [1.         0.90909091]
fscore: [0.5        0.95238095]
suppor: [ 1 22]
---------
precis: [0.5    0.9375]
recall: [0.5    0.9375]
fscore: [0.5    0.9375]
suppor: [ 2 16]
---------
precis: [1.         0.85714286]
recall: [0.33333333 1.        ]
fscore: [0.5        0.92307692]
suppor: [ 9 36]
---------
precis: [1. 1.]
recall: [1. 1.]
fscore: [1. 1.]
suppor: [ 1 15]
---------
precis: [0.         0.95454545]
recall: [0.         0.91304348]
fscore: [0.         0.93333333]
suppor: [ 1 23]
---------
