In [2]:
import os
import json
import pickle
import numpy as np

In [3]:
autocast_questions = json.load(open('autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

## Create baseline models outputting random answers

In [4]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def bert_model(question):
    inputs = tokenizer(question['stem'], question['choices'], return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1)
    return probabilities.detach().numpy()[0]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [5]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [7]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(calibrated_random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)

KeyError: 'stem'

## Evaluate the model

In [6]:
tf_results, mc_results, num_results = [],[],[]
for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 25.00, MCQ: 38.05, NUM: 22.63
Combined Metric: 85.67


In [None]:
import os
import json
import pickle
import numpy as np
import tensorflow as tf

# Define the neural network model
def build_model(input_shape):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(64, input_shape=input_shape, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define a function to preprocess the data
def preprocess_data(question):
    if question['qtype'] == 't/f':
        inputs = np.array([len(question['question']), len(question['choices'][0]), len(question['choices'][1])])
        output = 0 if question['answer'] == 'no' else 1
    elif question['qtype'] == 'mc':
        inputs = np.array([len(question['question']), len(question['choices'][0]), len(question['choices'][1]), len(question['choices'][2]), len(question['choices'][3])])
        output = ord(question['answer']) - ord('A')
    elif question['qtype'] == 'num':
        inputs = np.array([len(question['question']), len(question['choices'][0]), len(question['choices'][1])])
        output = float(question['answer'])
    return inputs, output

# Preprocess the data
inputs = []
outputs = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    input_, output_ = preprocess_data(question)
    inputs.append(input_)
    outputs.append(output_)

inputs = np.array(inputs)
outputs = np.array(outputs)

# Train the model
model = build_model(inputs[0].shape)
model.fit(inputs, outputs, epochs=10, batch_size=32)

## Make predictions on test set

In [7]:
# Make predictions on the test set
preds = []
for question in test_questions:
    input_, _ = preprocess_data(question)
    pred = model.predict(np.array([input_]))[0]
    if question['qtype'] == 't/f':
        preds.append([1-pred, pred])
    elif question['qtype'] == 'mc':
        preds.append(pred)
    else:
        preds.append([pred])

In [8]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

  adding: predictions.pkl (deflated 79%)


In [9]:
!ls

README.md                          example_submission.ipynb
autocast_competition_test_set.json [34msubmission[m[m
autocast_questions.json            submission.zip
