# Intercoder agreement analysis (Section 3.5)

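This notebook produces Table 2 in the paper: Lawma 8B accuracy, raw and adjusted for majority-class downsampling, alongside intercoder agreement rates.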

In [1]:
import json
import numpy as np
from utils import load_responses, responses_to_acc, responses_to_counts, find_first_integer, accuracy

In [2]:
# Per-task statistics; later cells read the 'task', 'downsampled_by' and
# 'majority_class' fields of each entry.
with open('results/task_statistics.json') as stats_file:
    task_statistics = json.load(stats_file)

In [3]:
# Load the saved Lawma 8B responses; `responses` is indexed by task name in the cells below.
# NOTE(review): the first argument ({}) is passed through to load_responses as-is; its
# meaning is defined in utils — confirm there.
model_name = 'lawma-8b'
responses = load_responses({}, [model_name], 'results/model_responses/')[model_name]

  0%|          | 0/260 [00:00<?, ?it/s]

100%|██████████| 260/260 [00:01<00:00, 253.36it/s]

lawma-8b has 260 tasks





In [4]:
def responses_to_acc_maj(responses, majority_class):
    """Compute accuracy on the examples whose ground truth is the majority class.

    Args:
        responses: Sequence of dicts, each providing a 'ground_truth' and a
            'model_response' entry.
        majority_class: Label of the majority class. A plain int is converted
            to the string ' <int>' (with a leading space) before it is compared
            against the ground-truth labels.

    Returns:
        A tuple ``(acc, n)``: the result of ``accuracy(y_true, y_pred)`` on the
        majority-class subset, and the number ``n`` of examples in that subset.

    Raises:
        ValueError: If no response has ``majority_class`` as its ground truth,
            since the accuracy of an empty subset is undefined.
    """
    if type(majority_class) is int:
        # NOTE(review): presumably integer labels are stored with a leading
        # space in the response files — confirm against the data.
        majority_class = ' ' + str(majority_class)

    pairs = [(r['ground_truth'], r['model_response']) for r in responses]
    majority_pairs = [(true, pred) for true, pred in pairs if true == majority_class]

    if not majority_pairs:
        # Fail with a readable message instead of dumping every response and
        # then hitting an IndexError on the empty list.
        raise ValueError(
            f'no response has ground truth equal to majority class {majority_class!r} '
            f'({len(pairs)} responses checked)'
        )

    y_true = [true for true, _ in majority_pairs]
    y_pred = [pred for _, pred in majority_pairs]

    # Decide from the first label whether the task uses numeric labels; a label
    # may be a list, in which case its first element is inspected.
    first_label = y_true[0]
    if isinstance(first_label, list):
        first_label = first_label[0]
    if first_label.strip().isdigit():
        # Numeric task: reduce each raw model response with find_first_integer
        # before scoring.
        y_pred = [find_first_integer(pred) for pred in y_pred]

    return accuracy(y_true, y_pred), len(y_true)

In [5]:
# Per-task accuracy, plus an accuracy re-weighted to undo the downsampling of
# the majority class.
metrics = {}
for statistics in task_statistics:
    task = statistics['task']
    task_responses = responses[task]
    acc = responses_to_acc(task_responses)

    # Fraction of majority-class samples that were kept for this task.
    keep_fraction = statistics['downsampled_by']

    _, num_total = responses_to_counts(task_responses)
    acc_maj, num_maj = responses_to_acc_maj(task_responses, statistics['majority_class'])

    # Split the correct predictions into majority-class and remaining examples.
    num_rest = num_total - num_maj
    num_correct = num_total * acc
    num_maj_correct = num_maj * acc_maj
    num_rest_correct = num_correct - num_maj_correct

    # Scale the majority-class counts back up by the keep fraction, then
    # recompute accuracy over the re-weighted total.
    num_maj_correct_adj = num_maj_correct/keep_fraction
    num_total_adj = num_maj/keep_fraction + num_rest
    acc_adj = (num_maj_correct_adj + num_rest_correct)/num_total_adj

    metrics[task] = dict(
        task=task,
        accuracy=acc,
        accuracy_majority=acc_maj,  # accuracy restricted to y_true == majority_class
        num_majority=num_maj,
        num_total=num_total,
        num_correct=num_correct,
        num_maj_correct=num_maj_correct,
        accuracy_adjusted=acc_adj,
        keep_fraction=keep_fraction,  # e.g. 0.3: only a 0.3 fraction of majority-class samples was used
    )

In [6]:
# Print one row per task and collect the raw and adjusted accuracies, which the
# next cell averages.
accs, accs_adj = [], []
print('Task'.ljust(25), 'Acc    AdjAcc Keep')
for task, task_metrics in metrics.items():
    acc = task_metrics['accuracy']
    acc_adj = task_metrics['accuracy_adjusted']
    keep_fraction = task_metrics['keep_fraction']
    print(f'{task.ljust(25)} {acc:.4f} {acc_adj:.4f} {keep_fraction:.4f}')
    accs.append(acc)
    accs_adj.append(acc_adj)

Task                      Acc    AdjAcc Keep
songer_concur             0.6437 0.6868 0.0318
songer_post_trl           0.7118 0.8725 0.0298
songer_appel2_1_2         0.6688 0.6920 0.7500
songer_rtcouns            0.9583 0.9719 0.0121
songer_direct2            0.6753 0.6753 1.0000
songer_comment            0.9167 0.9998 0.0013
songer_appel2_3_2         0.5926 0.5926 1.0000
songer_numappel           0.7180 0.8304 0.2728
songer_othcrim            0.8420 0.8442 0.0707
songer_judrev             0.7500 0.8491 0.0044
songer_respond1_7_4       1.0000 1.0000 0.0254
songer_respond2_7_5       0.6875 0.7086 0.7750
songer_method             0.8107 0.8639 0.1212
sc_casedisposition        0.9460 0.9460 1.0000
songer_initiate           0.8210 0.8210 1.0000
songer_respond2_8_2       0.6667 0.6575 0.5357
songer_alj                0.8590 0.8715 0.0102
songer_pretrial           0.7941 0.7996 0.0318
sc_issue_4                0.6341 0.6341 1.0000
songer_standing           0.7969 0.9340 0.0127
songer_appel1_2

In [7]:
# Unweighted mean over tasks of the raw and the adjusted accuracy.
for label, values in (('Accuracy: ', accs), ('Adjusted accuracy: ', accs_adj)):
    print(label, np.mean(values))

Accuracy:  0.8070947926779508
Adjusted accuracy:  0.8449136601192575


In [8]:
# Tasks included in the table, mapped to the variable name shown in the first column.
print_tasks = {
    'songer_weightev': 'WEIGHTEV',
    'songer_procedur': 'PROCEDUR',
    'songer_origin': 'ORIGIN',
    'songer_direct2': 'DIRECT2',
    'songer_direct1': 'DIRECT1',
    'songer_treat': 'TREAT',
    'songer_geniss': 'GENISS',
    'songer_circuit': 'CIRCUIT',
    'songer_comment': 'COMMENT',
}

# Intercoder agreement per task, stored as a fraction; it is scaled to a
# percentage when the table rows are built.
ic_agreement = {
    'songer_weightev': 0.76,
    'songer_procedur': 0.78,
    'songer_origin': 0.832,
    'songer_direct2': 0.856,
    'songer_direct1': 0.94,
    'songer_treat': 0.952,
    'songer_geniss': 0.976,
    'songer_circuit': 1.0,
    'songer_comment': 1.0,
}


# LaTeX header of the table; one row per task is appended in the loop below.
table_text = '\\begin{table}[h!]\n'
table_text += '\\centering\n'
table_text += '\\begin{tabular}{ccccc}\n'
table_text += '\\toprule\n'
table_text += '\\textbf{Name} & \\textbf{IC Agreement} & \\textbf{Adj accuracy} & (unadjusted) & Keep \\\\\n'
table_text += '\\midrule\n'

print('Task'.ljust(25), 'Acc    Adj Acc Keep')
for task in print_tasks:
    acc = metrics[task]['accuracy']
    acc_adj = metrics[task]['accuracy_adjusted']
    keep_fraction = metrics[task]['keep_fraction']
    print(task.ljust(25), f'{acc:.4f}', f'{acc_adj:.4f}', f'{keep_fraction:.4f}')

    # Round to one decimal BEFORE testing for a whole number: the float product
    # can land just off an integer (e.g. 56.99999999999999), which would
    # otherwise be rendered as "57.0" instead of "57".
    i_agreement = round(ic_agreement[task] * 100, 1)
    if i_agreement % 1 == 0:
        i_agreement = int(i_agreement)
    # Underscores must be escaped in LaTeX text mode.
    task_code = task.replace("_", "\\_")
    table_text += f'{print_tasks[task]} ({task_code}) & \\textbf{{{i_agreement}}} & \\textbf{{{acc_adj*100:.1f}\\%}} & ({acc*100:.1f}\\%) & {keep_fraction*100:.2f}\\% \\\\ \n'

table_text += '\\bottomrule\n'
table_text += '\\end{tabular}\n'
# Close the table environment opened in the header; without it the generated
# LaTeX does not compile.
table_text += '\\end{table}\n'

Task                      Acc    Adj Acc Keep
songer_weightev           0.7720 0.7872 0.2872
songer_procedur           0.7390 0.7518 0.8308
songer_origin             0.7770 0.8007 0.5313
songer_direct2            0.6753 0.6753 1.0000
songer_direct1            0.8050 0.8050 1.0000
songer_treat              0.9010 0.9109 0.7126
songer_geniss             0.9290 0.9324 0.8477
songer_circuit            0.9320 0.9320 1.0000
songer_comment            0.9167 0.9998 0.0013


In [9]:
# Show the assembled LaTeX source of the table.
print(table_text)

\begin{table}[h!]
\centering
\begin{tabular}{ccccc}
\toprule
\textbf{Name} & \textbf{IC Agreement} & \textbf{Adj accuracy} & (unadjusted) & Keep \\
\midrule
WEIGHTEV (songer\_weightev) & \textbf{76} & \textbf{78.7\%} & (77.2\%) & 28.72\% \\ 
PROCEDUR (songer\_procedur) & \textbf{78} & \textbf{75.2\%} & (73.9\%) & 83.08\% \\ 
ORIGIN (songer\_origin) & \textbf{83.2} & \textbf{80.1\%} & (77.7\%) & 53.13\% \\ 
DIRECT2 (songer\_direct2) & \textbf{85.6} & \textbf{67.5\%} & (67.5\%) & 100.00\% \\ 
DIRECT1 (songer\_direct1) & \textbf{94} & \textbf{80.5\%} & (80.5\%) & 100.00\% \\ 
TREAT (songer\_treat) & \textbf{95.2} & \textbf{91.1\%} & (90.1\%) & 71.26\% \\ 
GENISS (songer\_geniss) & \textbf{97.6} & \textbf{93.2\%} & (92.9\%) & 84.77\% \\ 
CIRCUIT (songer\_circuit) & \textbf{100} & \textbf{93.2\%} & (93.2\%) & 100.00\% \\ 
COMMENT (songer\_comment) & \textbf{100} & \textbf{100.0\%} & (91.7\%) & 0.13\% \\ 
\bottomrule
\end{tabular}

