In [197]:
import pandas as pd
import numpy as np
import pickle
import os

In [198]:
def brier_score(probabilities, answer_probabilities):
    """Return the halved sum of squared errors between a forecast and its target.

    Args:
        probabilities: Predicted probability for each choice. Any array-like
            (list, tuple, ndarray) is accepted.
        answer_probabilities: Target distribution over the same choices, in
            the same order.

    Returns:
        ``sum((probabilities - answer_probabilities) ** 2) / 2``, or the
        neutral score ``0.5`` when the two inputs do not have the same shape.
    """
    # Coerce up front so plain Python lists/tuples work; list - list would
    # otherwise raise TypeError.
    probabilities = np.asarray(probabilities, dtype=float)
    answer_probabilities = np.asarray(answer_probabilities, dtype=float)
    # Compare full shapes rather than len(): len() only looks at the first
    # dimension, so e.g. (n, 1) vs (n,) would pass and then silently broadcast
    # into an n x n difference matrix.
    if probabilities.shape != answer_probabilities.shape:
        return 0.5  # return a neutral score for invalid predictions
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [199]:
# Build the ground-truth target for every test question, in file order.
#   t/f -> one-hot vector over the choices (index 0 when the answer is 'no',
#          index 1 otherwise)
#   mc  -> one-hot vector over the choices (answer letter 'A' -> 0, 'B' -> 1, ...)
#   num -> the answer as a float
# `answers` and `qtypes` are appended to together so they always stay aligned.
answers_csv = pd.read_csv('autocast_test_set_w_answers.csv')
answers = []
qtypes = []
for row_idx, question in answers_csv.iterrows():
    qtype = question['qtype']
    if qtype in ('t/f', 'mc'):
        if qtype == 't/f':
            ans_idx = 0 if question['answers'] == 'no' else 1
        else:
            ans_idx = ord(question['answers']) - ord('A')
        # NOTE(review): eval() executes whatever expression is stored in the
        # 'choices' column; only run this on a trusted answers file.
        ans = np.zeros(len(eval(question['choices'])))
        ans[ans_idx] = 1
    elif qtype == 'num':
        ans = float(question['answers'])
    else:
        # Without this guard the previous row's `ans` would be appended again
        # while `qtypes` got no entry, silently shifting every later question.
        raise ValueError(f"Unknown qtype {qtype!r} at row {row_idx}")
    answers.append(ans)
    qtypes.append(qtype)

In [200]:

# Load the pickled submission predictions into `preds`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open submission files from a trusted source.
predictions_path = os.path.join('submission', 'predictions.pkl')
with open(predictions_path, 'rb') as predictions_file:
    preds = pickle.load(predictions_file)

In [201]:
# Score every prediction against its ground-truth answer, grouped by question type.
# zip() stops at the shortest input, so a submission with too few (or too many)
# predictions would silently be scored on a partial question set. Check first.
if not (len(preds) == len(answers) == len(qtypes)):
    raise ValueError(
        f"Length mismatch: {len(preds)} predictions, {len(answers)} answers, "
        f"{len(qtypes)} question types"
    )

tf_results, mc_results, num_results = [], [], []
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        # Every remaining question type is scored by absolute error.
        num_results.append(np.abs(p - a))

# Compute each per-type mean once and reuse it in both report lines.
tf_mean = np.mean(tf_results)
mc_mean = np.mean(mc_results)
num_mean = np.mean(num_results)

print(f"T/F: {tf_mean*100:.2f}, MCQ: {mc_mean*100:.2f}, NUM: {num_mean*100:.2f}")
print(f"Combined Metric: {(tf_mean + mc_mean + num_mean)*100:.2f}")

T/F: 26.69, MCQ: 50.00, NUM: 23.28
Combined Metric: 99.97


In [202]:
# NOTE(review): this cell repeats the scoring done in the previous cell;
# consider deleting one of the two so the notebook has a single source of truth.
#
# zip() stops at the shortest input, so mismatched lengths would silently
# produce metrics over a truncated question set. Fail loudly instead.
n_preds, n_answers, n_qtypes = len(preds), len(answers), len(qtypes)
if not (n_preds == n_answers == n_qtypes):
    raise ValueError(
        f"Length mismatch: {n_preds} predictions, {n_answers} answers, "
        f"{n_qtypes} question types"
    )

tf_results, mc_results, num_results = [], [], []
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        # Anything that is not t/f or mc is scored by absolute error.
        num_results.append(np.abs(p - a))

# Per-type averages, computed once and shared by both printed lines.
tf_mean, mc_mean, num_mean = (
    np.mean(scores) for scores in (tf_results, mc_results, num_results)
)

print(f"T/F: {tf_mean*100:.2f}, MCQ: {mc_mean*100:.2f}, NUM: {num_mean*100:.2f}")
print(f"Combined Metric: {(tf_mean + mc_mean + num_mean)*100:.2f}")

T/F: 26.69, MCQ: 50.00, NUM: 23.28
Combined Metric: 99.97
