In [40]:
import os
import json
import pickle
import numpy as np

In [41]:
autocast_questions = json.load(open('autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

## Create baseline models outputting random answers

In [42]:
# def random_baseline_model(question):
#     if question['qtype'] == 't/f':
#         return np.random.random(size=2)
#     elif question['qtype'] == 'mc':
#         probs = np.random.random(size=len(question['choices']))
#         return probs / probs.sum()
#     elif question['qtype'] == 'num':
#         return np.random.random()


# def calibrated_random_baseline_model(question):
#     if question['qtype'] == 't/f':
#         pred_idx = np.argmax(np.random.random(size=2))
#         pred = np.ones(2)
#         pred[pred_idx] += 1e-5
#         return pred / pred.sum()
#     elif question['qtype'] == 'mc':
#         pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
#         pred = np.ones(len(question['choices']))
#         pred[pred_idx] += 1e-5
#         return pred / pred.sum()
#     elif question['qtype'] == 'num':
#         return 0.5

In [43]:
import os
import json
import pickle
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Set the device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Load the Autocast and competition test set questions
autocast_questions = json.load(open('autocast_questions.json'))
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

def bert_model(question):
    # Tokenize the question and convert to input features
    question_text = question['question']
    if question['qtype'] == 't/f':
        choices_text = ['true', 'false']
    elif question['qtype'] == 'mc':
        choices_text = question['choices']
    else:
        return 0.5

    inputs = tokenizer(question_text, choices_text, return_tensors='pt', padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make a forward pass through the BERT model and get the logits
    logits = model(**inputs).logits.squeeze()

    # Convert logits to probabilities using softmax
    probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()

    # Return the probabilities
    return probs.tolist()

def brier_score(probabilities, answer_probabilities):
    if len(probabilities) != len(answer_probabilities):
        return 0.5  # return a neutral score for invalid predictions
    return ((probabilities - answer_probabilities) ** 2).sum() / 2


preds = []
answers = []
qtypes = []

for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(bert_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

tf_results, mc_results, num_results = [], [], []

for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

T/F: 29.54, MCQ: 49.55, NUM: 22.63
Combined Metric: 101.71


## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [44]:
def brier_score(probabilities, answer_probabilities):
    if len(probabilities) != len(answer_probabilities):
        return 0.5  # return a neutral score for invalid predictions
    
    # Calculate Brier score
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [45]:
# preds = []
# answers = []
# qtypes = []
# for question in autocast_questions:
#     if question['id'] in test_ids: # skipping questions in the competition test set
#         continue
#     if question['answer'] is None: # skipping questions without answer
#         continue
#     preds.append(calibrated_random_baseline_model(question))
#     if question['qtype'] == 't/f':
#         ans_idx = 0 if question['answer'] == 'no' else 1
#         ans = np.zeros(len(question['choices']))
#         ans[ans_idx] = 1
#         qtypes.append('t/f')
#     elif question['qtype'] == 'mc':
#         ans_idx = ord(question['answer']) - ord('A')
#         ans = np.zeros(len(question['choices']))
#         ans[ans_idx] = 1
#         qtypes.append('mc')
#     elif question['qtype'] == 'num':
#         ans = float(question['answer'])
#         qtypes.append('num')
#     answers.append(ans)

In [46]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(bert_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

## Evaluate the model

In [47]:
# tf_results, mc_results, num_results = [],[],[]
# for p, a, qtype in zip(preds, answers, qtypes):
#     if qtype == 't/f':
#         tf_results.append(brier_score(p, a))
#     elif qtype == 'mc':
#         mc_results.append(brier_score(p, a))
#     else:
#         num_results.append(np.abs(p - a))

# print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
# print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

In [48]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        if len(p) == 2:
            a = a[:2]
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")


T/F: 29.54, MCQ: 25.45, NUM: 22.63
Combined Metric: 77.61


## Make predictions on test set

In [49]:
# preds = []
# for question in test_questions:
#     preds.append(calibrated_random_baseline_model(question))

In [50]:
preds = []
for question in test_questions:
    preds.append(bert_model(question))

In [51]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

  adding: predictions.pkl (deflated 61%)


In [52]:
!ls

README.md                          predictions.pkl
autocast_competition_test_set.json [34msubmission[m[m
autocast_questions.json            submission.zip
autocast_test_set_w_answers.csv    submission1.ipynb
evaluate.ipynb                     submission2.ipynb
example_submission.ipynb
