In [1]:
import torch
from util import cleaner
from collections import Counter

In [6]:
def data_loader():
    with open(f'data/test_data_2800/questions.txt', 'r') as f:
        questions = f.read().split('\n')
    with open(f'data/test_data_2800/answers.txt', 'r') as f:
        answers = f.read().split('\n')
    with open(f'data/test_data_2800/model_answers.txt', 'r') as f:
        model_answers = f.read().split('\n')
    with open("data/question-to-topic-cleaned.json", "r") as f:
        question_to_topic = eval(f.readline())
    return questions, answers, model_answers, question_to_topic

In [15]:
'''
Calls GPT-3 on an input question.
:param input_question: str the question to pass into GPT-3
:returns: str the response from GPT-3
'''
def evaluate_on_gpt3(input_question):
    return "0"
    raise NotImplementedError()

'''
Determines if `response` and `answer` are equivalent
:param response: str response given by a model
:param answer: float/int the correct answer
:returns: bool whether `response` represents `answer`
'''
def correct(response, answer):
    return eval(response) == answer
    raise NotImplementedError()

In [22]:
def main():
    questions, answers, model_answers, question_to_topic = data_loader()
    answers = [eval(ans) for ans in answers]
    model_answers = [eval(ans) if ans != '' else None for ans in model_answers]
    question_topics = [question_to_topic[cleaner(question)] for question in questions]
    topic_to_score_gpt3 = {}
    topic_to_score_model = {}
    topic_to_freq = Counter(question_topics)
    
    for question,answer,model_answer,topic in zip(questions, answers, model_answers, question_topics):
        response_gpt3 = evaluate_on_gpt3(question)
        if correct(response_gpt3, answer): # determine if response and answer are the same
            topic_to_score_gpt3[topic] = topic_to_score_gpt3.get(topic, 0) + 1
        if model_answer == answer:
            topic_to_score_model[topic] = topic_to_score_model.get(topic, 0) + 1
    
    topic_to_acc_gpt3 = {topic: topic_to_score_gpt3[topic]/topic_to_freq[topic] for topic in topic_to_score_gpt3}
    topic_to_acc_model = {topic: topic_to_score_model[topic]/topic_to_freq[topic] for topic in topic_to_score_model}
    test_score_gpt3 = sum([topic_to_score_gpt3[topic] for topic in topic_to_score_gpt3])/sum([topic_to_freq[topic] for topic in topic_to_score_gpt3])
    topic_score_gpt3 = sum([topic_to_score_gpt3[topic]/(12*topic_to_freq[topic]) for topic in topic_to_score_gpt3])
    test_score_model = sum([topic_to_score_model[topic] for topic in topic_to_score_model])/sum([topic_to_freq[topic] for topic in topic_to_score_model])
    topic_score_model = sum([topic_to_score_model[topic]/(12*topic_to_freq[topic]) for topic in topic_to_score_model])
    print('GPT-3 performance:')
    print(topic_to_acc_gpt3)
    print(f'Score on test data: {test_score_gpt3}')
    print(f'Weighted score across all topics: {topic_score_gpt3}')
    print('\nOur model performance:')
    print(topic_to_acc_model)
    print(f'Score on test data: {test_score_model}')
    print(f'Weighted score across all topics: {topic_score_model}')

In [23]:
main()

GPT-3 performance:
{'p': 0.2777777777777778, 'cnn': 0.019704433497536946, 'sm_mdp': 0.2403846153846154, 'lr': 0.06209150326797386, 'f': 0.03767123287671233, 'r': 0.017605633802816902, 'rnn': 0.03409090909090909, 'b': 0.02127659574468085, 'nn_ii': 0.005319148936170213}
Score on test data: 0.08112652374947457
Weighted score across all topics: 0.05966015419826612

Our model performance:
{'p': 0.9806763285024155, 'dtnn': 1.0, 'rnn': 1.0, 'cnn': 0.8571428571428571, 'lr': 0.8627450980392157, 'f': 0.8561643835616438, 'r': 0.971830985915493, 'nn_i': 1.0, 'nn_ii': 0.9680851063829787, 'b': 1.0, 'sm_mdp': 1.0, 'rl': 1.0}
Score on test data: 0.9310714285714285
Weighted score across all topics: 0.9580537299620504
