### Demo using the Question Answering pipeline from huggingface

We use the four multiple-choice options as the context. For evaluation we find the option that is the closest match to the generated answer. The heuristic is: compute the LCS (longest common subsequence) of the prediction and each option, and divide its length by the length of the longer of the two strings. The option with the largest similarity to the prediction is taken as our final answer and compared with the ground truth.

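With this we (sadly) get only 25.1% accuracy for the astronomy subject, which is just 0.1 percentage points better than random guessing (25%). Note: in the run recorded below, the ~25.2% score (34/135) belongs to anatomy, while astronomy scores 21.7% (33/152).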

In [None]:
# Install the notebook's third-party dependencies. The output recorded below
# shows this run resolving sacremoses 0.0.43 and transformers 3.4.0.
!pip install sacremoses
!pip install transformers

Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 2.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=9d004a5b784d4b367611eaafd27be63f9e1b64c0d56c908ac2ba883a0c286e22
  Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.43
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 2.8MB/s 
Collecting 

In [None]:
# Unpack data.tar into the current working directory.
# NOTE(review): presumably this provides the test/ folder of *_test.csv files
# that the evaluation cell reads later — confirm against the archive contents.
!tar -xf data.tar

In [None]:
import transformers
from transformers import pipeline
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Question-answering pipeline; no model is named, so the library picks its default.
question_answerer = pipeline('question-answering')
# Answer letters in option order: a letter's index is the 0-based option position.
choices = ["A", "B", "C", "D"]
# NOTE(review): save_dir is created here but not written to by the cells shown.
save_dir = "results"
data_dir = "."

# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(save_dir, exist_ok=True)

In [None]:
def get_info(df, idx):
    """Extract the question, option context and ground truth for one row.

    Row layout as read here: column 0 holds the question, the last column
    holds the answer letter, and every column in between is an answer option.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one question per row in the layout described above.
    idx : int
        Positional (``iloc``) index of the row to read.

    Returns
    -------
    tuple
        ``(question, context, answer, answer_id)`` where ``context`` is every
        option followed by ``".\\n"`` in column order, ``answer`` is the text
        of the correct option and ``answer_id`` is its 0-based position.

    Raises
    ------
    ValueError
        If the answer letter is not in the module-level ``choices`` list.
    IndexError
        If the answer letter points past the available option columns.
    """
    question = df.iloc[idx, 0]
    n_options = df.shape[1] - 2
    # str() keeps the concatenation below from raising TypeError when pandas
    # parsed an option cell as a number (or as NaN) instead of a string.
    options = [str(df.iloc[idx, j + 1]) for j in range(n_options)]
    context = "".join(option + ".\n" for option in options)
    answer_id = choices.index(df.iloc[idx, -1])
    answer = options[answer_id]
    return question, context, answer, answer_id

In [None]:
def lcs(s1, s2):
    """Longest common subsequence of two strings.

    A subsequence keeps character order but need not be contiguous, e.g. the
    LCS of ``"abcde"`` and ``"ace"`` is ``"ace"``.

    Parameters
    ----------
    s1, s2 : str
        Strings to compare. Either may be empty.

    Returns
    -------
    tuple
        ``(length, subsequence)``; ``(0, "")`` when nothing is shared or when
        either input is empty.
    """
    if not s1 or not s2:
        # The table below would have no cells to read the result from.
        return 0, ""

    # prev[j] is the LCS of the already-processed prefix of s1 and s2[:j].
    # The extra leading "" column (and the initial all-"" row) are explicit
    # boundaries, so no negative index is ever used.
    prev = [""] * (len(s2) + 1)
    for ch1 in s1:
        curr = [""]
        for j, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                curr.append(prev[j - 1] + ch1)
            else:
                # max() keeps its first argument on ties, so the cell above
                # wins over the cell to the left.
                curr.append(max(prev[j], curr[j - 1], key=len))
        prev = curr

    cs = prev[-1]
    return len(cs), cs

In [None]:
def text_iou(s1, s2):
    """Heuristic text similarity: LCS length over the longer string's length.

    Despite the name this is not a true intersection-over-union; it divides
    the length of the longest common subsequence (from ``lcs``) by
    ``max(len(s1), len(s2))``.

    Parameters
    ----------
    s1, s2 : str
        Strings to compare.

    Returns
    -------
    float
        Score between 0.0 and 1.0; 0.0 when either string is empty.
    """
    if not s1 or not s2:
        # Nothing can be shared with an empty string; returning early also
        # avoids dividing by zero when both strings are empty.
        return 0.0
    lcs_len, _ = lcs(s1, s2)
    return lcs_len / max(len(s1), len(s2))

In [None]:
from difflib import *

def eval(test_df, subject):
    """Count the questions in ``test_df`` that the QA pipeline gets right.

    For every row, the answer options are passed to ``question_answerer`` as
    the context, each option is scored against the predicted answer with
    ``text_iou``, and the best-scoring option is compared with the ground
    truth. Note that this function shadows the built-in ``eval``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        One question per row in the layout ``get_info`` reads: question text,
        the answer options, then the answer letter in the last column.
    subject : str
        Name of the subject being evaluated. Currently unused.

    Returns
    -------
    int
        Number of rows whose best-matching option is the correct one.
    """
    n_options = test_df.shape[1] - 2
    total_correct = 0
    for i in tqdm(range(test_df.shape[0])):
        question, context, _, gr_truth_id = get_info(test_df, i)
        result = question_answerer({
            'question': question,
            'context': context
        })
        # Score each option from its own cell instead of splitting the context
        # on newlines: an option containing "\n" would otherwise yield extra
        # pieces and shift the indices compared with gr_truth_id. The trailing
        # "." mirrors how get_info writes each option into the context.
        scores = [
            text_iou(str(test_df.iloc[i, j + 1]) + ".", result['answer'])
            for j in range(n_options)
        ]
        # np.argmax returns the first maximum, so ties go to the earliest option.
        total_correct += int(np.argmax(scores) == gr_truth_id)
    return total_correct



In [None]:

data_dir = "./"
test_dir = os.path.join(data_dir, "test")
test_suffix = "_test.csv"

# A subject is a file name minus its "_test.csv" suffix. endswith() (rather
# than a substring test) skips stray files such as "x_test.csv.bak".
subjects = sorted(
    f[:-len(test_suffix)] for f in os.listdir(test_dir) if f.endswith(test_suffix)
)

print(subjects)
# Per-subject accuracy as a fraction in [0, 1], in the same order as `subjects`.
percentages = []
for subject in subjects:
    test_df = pd.read_csv(os.path.join(test_dir, subject + test_suffix), header=None)
    total_correct = eval(test_df, subject)
    accuracy = total_correct / test_df.shape[0]
    # The label says "TOTAL CORRECT", but the value printed is the accuracy.
    print("TOTAL CORRECT: ", accuracy)
    percentages.append(accuracy)
print(percentages)



  0%|          | 0/100 [00:00<?, ?it/s][A[A

['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehis









































































































































































































100%|██████████| 100/100 [01:29<00:00,  1.12it/s]


  0%|          | 0/135 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.18
















































































































































































































































































100%|██████████| 135/135 [02:01<00:00,  1.11it/s]


  0%|          | 0/152 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2518518518518518


















































































































































































































































































































100%|██████████| 152/152 [02:16<00:00,  1.11it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.21710526315789475










































































































































































































100%|██████████| 100/100 [01:32<00:00,  1.08it/s]


  0%|          | 0/265 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.31




















































































































































































































































































































































































































































































































































100%|██████████| 265/265 [03:58<00:00,  1.11it/s]


  0%|          | 0/144 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2


































































































































































































































































































100%|██████████| 144/144 [02:09<00:00,  1.11it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.3333333333333333










































































































































































































100%|██████████| 100/100 [01:29<00:00,  1.11it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.31










































































































































































































100%|██████████| 100/100 [01:30<00:00,  1.11it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.26










































































































































































































100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


  0%|          | 0/173 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.23




























































































































































































































































































































































100%|██████████| 173/173 [02:35<00:00,  1.11it/s]


  0%|          | 0/102 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.18497109826589594














































































































































































































100%|██████████| 102/102 [01:31<00:00,  1.11it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.3235294117647059










































































































































































































100%|██████████| 100/100 [01:29<00:00,  1.11it/s]


  0%|          | 0/235 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.31
























































































































































































































































































































































































































































































100%|██████████| 235/235 [03:33<00:00,  1.10it/s]


  0%|          | 0/114 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2680851063829787






































































































































































































































100%|██████████| 114/114 [01:42<00:00,  1.11it/s]


  0%|          | 0/145 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2719298245614035




































































































































































































































































































100%|██████████| 145/145 [02:09<00:00,  1.12it/s]


  0%|          | 0/378 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2206896551724138






















































































































































































































































































































































































































































































































































































































































































































































































100%|██████████| 378/378 [05:38<00:00,  1.12it/s]


  0%|          | 0/126 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2222222222222222






























































































































































































































































100%|██████████| 126/126 [01:55<00:00,  1.09it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.30952380952380953










































































































































































































100%|██████████| 100/100 [01:29<00:00,  1.12it/s]


  0%|          | 0/310 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.17














































































































































































































































































































































































































































































































































































































































100%|██████████| 310/310 [04:38<00:00,  1.11it/s]


  0%|          | 0/204 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2064516129032258


























































































































































































































































































































































































































100%|██████████| 204/204 [03:05<00:00,  1.10it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.22058823529411764










































































































































































































100%|██████████| 100/100 [01:29<00:00,  1.11it/s]


  0%|          | 0/165 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.23












































































































































































































































































































































100%|██████████| 165/165 [02:28<00:00,  1.11it/s]


  0%|          | 0/202 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.18181818181818182






















































































































































































































































































































































































































100%|██████████| 202/202 [03:01<00:00,  1.11it/s]


  0%|          | 0/193 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.25742574257425743




































































































































































































































































































































































































100%|██████████| 193/193 [02:56<00:00,  1.09it/s]


  0%|          | 0/391 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.24870466321243523
















































































































































































































































































































































































































































































































































































































































































































































































































100%|██████████| 391/391 [05:52<00:00,  1.11it/s]


  0%|          | 0/270 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.24552429667519182






























































































































































































































































































































































































































































































































































100%|██████████| 270/270 [04:01<00:00,  1.12it/s]


  0%|          | 0/238 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2740740740740741






























































































































































































































































































































































































































































































100%|██████████| 238/238 [03:34<00:00,  1.11it/s]


  0%|          | 0/154 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.21008403361344538






















































































































































































































































































































100%|██████████| 154/154 [02:18<00:00,  1.11it/s]


  0%|          | 0/545 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.22077922077922077












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































TOTAL CORRECT:  0.25871559633027524


















































































































































































































































































































































































































































100%|██████████| 216/216 [03:14<00:00,  1.11it/s]


  0%|          | 0/204 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.24537037037037038


























































































































































































































































































































































































































100%|██████████| 204/204 [03:03<00:00,  1.11it/s]


  0%|          | 0/237 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.24509803921568626




























































































































































































































































































































































































































































































100%|██████████| 237/237 [03:33<00:00,  1.11it/s]


  0%|          | 0/223 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.29535864978902954
































































































































































































































































































































































































































































100%|██████████| 223/223 [03:20<00:00,  1.11it/s]


  0%|          | 0/134 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2600896860986547














































































































































































































































































100%|██████████| 134/134 [02:00<00:00,  1.11it/s]


  0%|          | 0/121 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.26119402985074625




















































































































































































































































100%|██████████| 121/121 [01:49<00:00,  1.10it/s]


  0%|          | 0/108 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2066115702479339


























































































































































































































100%|██████████| 108/108 [01:37<00:00,  1.11it/s]


  0%|          | 0/163 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.23148148148148148








































































































































































































































































































































100%|██████████| 163/163 [02:26<00:00,  1.11it/s]


  0%|          | 0/105 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.25766871165644173




















































































































































































































100%|██████████| 105/105 [01:34<00:00,  1.12it/s]


  0%|          | 0/103 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.37142857142857144
















































































































































































































100%|██████████| 103/103 [01:32<00:00,  1.11it/s]


  0%|          | 0/234 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2621359223300971






















































































































































































































































































































































































































































































100%|██████████| 234/234 [03:30<00:00,  1.11it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2905982905982906










































































































































































































100%|██████████| 100/100 [01:29<00:00,  1.12it/s]


  0%|          | 0/791 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.25












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































TOTAL CORRECT:  0.22629582806573956






















































































































































































































































































































































































































































































































































































































































































































100%|██████████| 346/346 [05:11<00:00,  1.11it/s]


  0%|          | 0/895 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.19653179190751446












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































TOTAL CORRECT:  0.2770949720670391






































































































































































































































































































































































































































































































































































































































100%|██████████| 306/306 [04:38<00:00,  1.10it/s]


  0%|          | 0/311 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.25163398692810457
















































































































































































































































































































































































































































































































































































































































100%|██████████| 311/311 [04:40<00:00,  1.11it/s]


  0%|          | 0/324 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.18971061093247588










































































































































































































































































































































































































































































































































































































































































100%|██████████| 324/324 [04:54<00:00,  1.10it/s]


  0%|          | 0/282 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.19444444444444445






















































































































































































































































































































































































































































































































































































100%|██████████| 282/282 [04:14<00:00,  1.11it/s]


  0%|          | 0/1534 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2553191489361702


[1;30;43mStreaming output truncated to the last 5000 lines.[0m









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































TOTAL CORRECT:  0.2242503259452412






















































































































































































































































































































































































































































































































































































100%|██████████| 282/282 [04:19<00:00,  1.09it/s]


  0%|          | 0/624 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.25886524822695034












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































TOTAL CORRECT:  0.22435897435897437


































































































































































































































100%|██████████| 112/112 [01:40<00:00,  1.11it/s]


  0%|          | 0/245 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.20535714285714285












































































































































































































































































































































































































































































































100%|██████████| 245/245 [03:29<00:00,  1.17it/s]


  0%|          | 0/201 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.24081632653061225




















































































































































































































































































































































































































100%|██████████| 201/201 [02:55<00:00,  1.14it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.2537313432835821










































































































































































































100%|██████████| 100/100 [01:27<00:00,  1.15it/s]


  0%|          | 0/166 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.32














































































































































































































































































































































100%|██████████| 166/166 [02:27<00:00,  1.13it/s]


  0%|          | 0/171 [00:00<?, ?it/s][A[A

TOTAL CORRECT:  0.28313253012048195
























































































































































































































































































































































100%|██████████| 171/171 [02:28<00:00,  1.15it/s]

TOTAL CORRECT:  0.22807017543859648
[0.18, 0.2518518518518518, 0.21710526315789475, 0.31, 0.2, 0.3333333333333333, 0.31, 0.26, 0.23, 0.18497109826589594, 0.3235294117647059, 0.31, 0.2680851063829787, 0.2719298245614035, 0.2206896551724138, 0.2222222222222222, 0.30952380952380953, 0.17, 0.2064516129032258, 0.22058823529411764, 0.23, 0.18181818181818182, 0.25742574257425743, 0.24870466321243523, 0.24552429667519182, 0.2740740740740741, 0.21008403361344538, 0.22077922077922077, 0.25871559633027524, 0.24537037037037038, 0.24509803921568626, 0.29535864978902954, 0.2600896860986547, 0.26119402985074625, 0.2066115702479339, 0.23148148148148148, 0.25766871165644173, 0.37142857142857144, 0.2621359223300971, 0.2905982905982906, 0.25, 0.22629582806573956, 0.19653179190751446, 0.2770949720670391, 0.25163398692810457, 0.18971061093247588, 0.19444444444444445, 0.2553191489361702, 0.2242503259452412, 0.25886524822695034, 0.22435897435897437, 0.20535714285714285, 0.24081632653061225, 0.253731343283582




In [None]:
# Accuracy values pasted from the evaluation cell's printed output above; the
# trailing entries match the "TOTAL CORRECT" fractions logged after each
# progress bar. Presumably one entry per evaluated subject - TODO confirm
# against the evaluation loop.
[0.18, 0.2518518518518518, 0.21710526315789475, 0.31, 0.2, 0.3333333333333333, 0.31, 0.26, 0.23, 0.18497109826589594, 0.3235294117647059, 0.31, 0.2680851063829787, 0.2719298245614035, 0.2206896551724138, 0.2222222222222222, 0.30952380952380953, 0.17, 0.2064516129032258, 0.22058823529411764, 0.23, 0.18181818181818182, 0.25742574257425743, 0.24870466321243523, 0.24552429667519182, 0.2740740740740741, 0.21008403361344538, 0.22077922077922077, 0.25871559633027524, 0.24537037037037038, 0.24509803921568626, 0.29535864978902954, 0.2600896860986547, 0.26119402985074625, 0.2066115702479339, 0.23148148148148148, 0.25766871165644173, 0.37142857142857144, 0.2621359223300971, 0.2905982905982906, 0.25, 0.22629582806573956, 0.19653179190751446, 0.2770949720670391, 0.25163398692810457, 0.18971061093247588, 0.19444444444444445, 0.2553191489361702, 0.2242503259452412, 0.25886524822695034, 0.22435897435897437, 0.20535714285714285, 0.24081632653061225, 0.2537313432835821, 0.32, 0.28313253012048195, 0.22807017543859648]


In [None]:
# Sanity check of the question-answering pipeline on a single hand-written
# multiple-choice item: the four answer options form the context and the
# pipeline extracts an answer span from it.
#
# Requires `question_answerer` (the pipeline created in the setup cell near the
# top of the notebook); after a kernel restart run that cell first, otherwise
# this cell raises NameError.
#
# The options are separated by a real newline escape ('\n'). The literal
# two-character sequence '/n' would become part of the context text and could
# leak into the extracted answer span.
# NOTE(review): if the evaluation loop builds its context with a different
# separator, keep the two consistent.
question_answerer({
    'question' : 'Ani believes that her attitudes and behavior play a central role in what happens to her. Such a belief is likely to be associated with',
    'context' : 'a strong superego.\n low self-esteem.\n low self-efficacy.\n an internal locus of control.'
})
# ground truth: an internal locus of control

NameError: ignored