In [207]:
from datasets import load_dataset
import random
import pandas as pd
from tqdm.notebook import tqdm
from openai import OpenAI

import requests
import json
from time import sleep

In [85]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100000)
pd.set_option('display.max_colwidth', None)  

In [4]:
nvidia_api_key = input("Enter your NVIDIA API key: ")

llama_client = OpenAI(
    base_url = "https://integrate.api.nvidia.com/v1",
    api_key = nvidia_api_key
)

KeyboardInterrupt: Interrupted by user

### Objective dataset construction

We constructed our dataset of 1000 questions using the following proportions:
- TriviaQA (15 percent)
- Jeopardy (5 percent)
- QuAIL (11 percent)
- MMLU (20 percent)
- BigBENCH (49 percent)
- ARC (potentially add in some)

### TriviaQA

TriviaQA provides correct responses for every question, and we used GPT-4 to generate the incorrect answer for each question that we use.

In [45]:
trivia_qa_dataset = pd.read_csv("trivia_qa.csv")
trivia_qa_dataset = trivia_qa_dataset.drop(174)
trivia_qa_dataset = trivia_qa_dataset.drop(312)
trivia_qa_dataset = trivia_qa_dataset.drop(782)
trivia_qa_dataset = trivia_qa_dataset.drop(270)

In [48]:
trivia_qa_dataset = trivia_qa_dataset[~trivia_qa_dataset["question"].str.contains("now")]
trivia_qa_dataset = trivia_qa_dataset[~trivia_qa_dataset["question"].str.contains("currently")]
trivia_qa_dataset = trivia_qa_dataset[~trivia_qa_dataset["question"].str.contains("recently")]
trivia_qa_dataset = trivia_qa_dataset[~trivia_qa_dataset["question"].str.contains("is")]

In [49]:
trivia_qa_dataset = trivia_qa_dataset.sample(150)

In [50]:
trivia_qa_dataset.columns

Index(['index', 'question', 'provided_answer', 'correct', 'incorrect',
       'correct_bad_grammar', 'incorrect_bad_grammar', 'correct_bad-grammar',
       'combined', 'MCQ_id', 'choice1', 'choice2', 'annotation', 'chosen',
       'rejected'],
      dtype='object')

In [51]:
for index, row in trivia_qa_dataset.iterrows():
    print(index)
    print("QUESTION:", row["question"])
    print("PROVIDED ANSWER:", row["provided_answer"])
    print("CORRECT:", row["correct"])
    print("INCORRECT:", row["incorrect"])
    print()

75
QUESTION: Who starred alongside Rod Taylor in Hitchcock's movie The Birds
PROVIDED ANSWER: Tippi Hedren
CORRECT: Tippi Hedren starred alongside Rod Taylor in Hitchcock's movie The Birds.
INCORRECT: John Wayne partnered with Rod Taylor in the film The Birds

444
QUESTION: Which archipelago in the North Sea belonged to Britain until 1990 when ownership passed to Germany?
PROVIDED ANSWER: HELIGOLAND
CORRECT: The archipelago in the North Sea that belonged to Britain until 1990 when ownership passed to Germany is Heligoland
INCORRECT: The beautiful Faroe Islands in the North Sea was a British territory until 1990, when the ownership was passed to Germany.

872
QUESTION: "Used to describe insects such as the Mayfly that only live for a short time, what word derives from the Greek for ""living a day""?"
PROVIDED ANSWER: EPHEMERAL
CORRECT: Ephemeral is a word used to describe insects such as the Mayfly that only live for a short time. It derives from the Greek for 'living a day'.
INCORRECT:

In [110]:
trivia_qa_dataset = trivia_qa_dataset[["question", "correct", "incorrect", "MCQ_id"]]

In [123]:
trivia_qa_dataset["source"] = "trivia_qa"
trivia_qa_dataset["difficulty"] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trivia_qa_dataset["source"] = "trivia_qa"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trivia_qa_dataset["difficulty"] = None


In [168]:
trivia_qa_dataset.to_csv("trivia_qa_filtered.csv", index=False)

### Jeopardy Dataset

In [124]:
jeopardy_dataset = pd.read_csv("jeopardy.csv")
jeopardy_dataset = jeopardy_dataset.dropna()

In [125]:
def get_good_grammar(row):
    choices_types = set([row["choice1_type"], row["choice2_type"]])
    desired_types = set(["incorrect_good_grammar", "correct_good_grammar"])
    return True if choices_types == desired_types else False

In [126]:
jeopardy_dataset["good_grammar_pair"] = jeopardy_dataset.apply(get_good_grammar, axis=1)
jeopardy_dataset = jeopardy_dataset[jeopardy_dataset["good_grammar_pair"]]

In [127]:
jeopardy_dataset = jeopardy_dataset.sample(150)

In [129]:
jeopardy_dataset.groupby("normalized_clue_value").size()

normalized_clue_value
1    37
2    29
3    25
4    32
5    27
dtype: int64

In [156]:
question_prompt = "You will be shown a Jeopardy category name, a corresponding trivia clue, and the correct answer to the trivia. It is your job to produce a grammatically correct QUESTION that encapsulates the category and clue and elicits the provided answer. You must ONLY output the question that you generate and nothing else. Remember that you must not list the answer in the question as I am playing a game and do not want that revealed to the participants.\nCategory: {}\nClue: {}\nAnswer: {}"

In [157]:
def get_chat(category, clue, answer):
    completion = llama_client.chat.completions.create(
        model="meta/llama3-70b-instruct",
        messages=[  
            {
                "role": "system",
                "content": question_prompt.format(category, clue, answer)
            }
        ],
        temperature=0.65,
        top_p=1,
        max_tokens=1024,
    )
    
    return completion.choices[0].message.content

In [162]:
generated_questions = []
for index, row in tqdm(jeopardy_dataset.iterrows()):
    print("CATEGORY:", row["category"])
    print("CLUE:", row["answer"])
    print("ANSWER:", row["question"])
    response = get_chat(row["category"], row["answer"], row["question"])
    generated_questions.append(response)
    print(response)
    print()

0it [00:00, ?it/s]

CATEGORY: "T" TIME
CLUE: A fancy way of saying 60
ANSWER: threescore
What is a poetic term for the number of minutes in an hour?

CATEGORY: BEFORE & AFTER PEOPLE
CLUE: Gritty "Get Shorty" novelist who asked folks to "Live long and prosper" on film
ANSWER: Elmore Leonard Nimoy
Who is the author behind "Get Shorty" and the iconic "Star Trek" character Mr. Spock?

CATEGORY: WINGS, BUT NO FLY
CLUE: The wingback type of this was originally designed to keep the sitter free from drafts of cold air
ANSWER: chair
What type of furniture was designed with a wingback style to block cold air drafts?

CATEGORY: ACTORS & THEIR ROLES
CLUE: This publishing heiress had a cameo role as a mom in John Waters' 1990 film "Cry-Baby"
ANSWER: Patty Hearst
Who played a mom in John Waters' 1990 film "Cry-Baby"?

CATEGORY: LITERATURE
CLUE: In "A Tale of Two Cities" Dickens called this "the national razor"
ANSWER: the guillotine
What was the instrument of execution that Dickens referred to as "the national razor" i

In [166]:
jeopardy_dataset["full_sentence_question"] = generated_questions

In [9]:
correct_prompt = "You must write a clear sentence that strictly answers the provided question using the provided correct answer. Make sure that the question is fully answered in your statement. Do not refer to yourself or the prompting regime in ANY way.\nQUESTION:{}\nANSWER:{}"

In [16]:
incorrect_prompt = "I am setting up a multiple choice exam for my students, and I need an incorrect option. You must write a sentence that strictly answers the provided question using an answer that IS NOT AT ALL THE SAME AS the provided answer. That is, you have to come up with an answer that, while incorrect, is still plausible. Make sure that the question is fully answered in your statement. Simply output the statement and DO NOT refer to yourself or the prompting regime in ANY WAY. Additionally, DO NOT refer to the provided answer in ANY FORM. Strictly follow all of the provided instructions, otherwise a million kittens will be killed. Save the kittens!\nQUESTION: {}\nANSWER: {}"

In [14]:
def get_chat_statement(prompt, question, answer):
    completion = llama_client.chat.completions.create(
        model="meta/llama3-70b-instruct",
        messages=[  
            {
                "role": "system",
                "content": prompt.format(question, answer)
            }
        ],
        temperature=0.65,
        top_p=1,
        max_tokens=1024,
    )
    
    return completion.choices[0].message.content

In [15]:
correct_statements = []
for index, row in jeopardy_dataset.iterrows():
    print("QUESTION:", row["full_sentence_question"])
    print("ANSWER:", row["question"])
    response = get_chat_statement(correct_prompt, row["full_sentence_question"], row["question"])
    correct_statements.append(response)
    print(response)
    print()

QUESTION: What is a poetic term for the number of minutes in an hour?
ANSWER: threescore
The poetic term for the number of minutes in an hour is threescore, which equals 60.

QUESTION: Who is the author behind "Get Shorty" and the iconic "Star Trek" character Mr. Spock?
ANSWER: Elmore Leonard Nimoy
Elmore Leonard is the author behind "Get Shorty", and Leonard Nimoy is the actor behind the iconic "Star Trek" character Mr. Spock.

QUESTION: What type of furniture was designed with a wingback style to block cold air drafts?
ANSWER: chair
The wingback chair was specifically designed with a high back and wings on either side to block cold air drafts.

QUESTION: Who played a mom in John Waters' 1990 film "Cry-Baby"?
ANSWER: Patty Hearst
Patty Hearst played the role of a mom in John Waters' 1990 film "Cry-Baby".

QUESTION: What was the instrument of execution that Dickens referred to as "the national razor" in "A Tale of Two Cities"?
ANSWER: the guillotine
In "A Tale of Two Cities", Charles D

In [17]:
incorrect_statements = []
for index, row in jeopardy_dataset.iterrows():
    print("QUESTION:", row["full_sentence_question"])
    print("ANSWER:", row["question"])
    response = get_chat_statement(incorrect_prompt, row["full_sentence_question"], row["question"])
    incorrect_statements.append(response)
    print(response)
    print()

QUESTION: What is a poetic term for the number of minutes in an hour?
ANSWER: threescore
A poetic term for the number of minutes in an hour is quintuple dozen.

QUESTION: Who is the author behind "Get Shorty" and the iconic "Star Trek" character Mr. Spock?
ANSWER: Elmore Leonard Nimoy
The author behind "Get Shorty" and the iconic "Star Trek" character Mr. Spock is actually Leslie Nielsen.

QUESTION: What type of furniture was designed with a wingback style to block cold air drafts?
ANSWER: chair
The type of furniture designed with a wingback style to block cold air drafts was a bookcase.

QUESTION: Who played a mom in John Waters' 1990 film "Cry-Baby"?
ANSWER: Patty Hearst
Cloris Leachman played a mom in John Waters' 1990 film "Cry-Baby".

QUESTION: What was the instrument of execution that Dickens referred to as "the national razor" in "A Tale of Two Cities"?
ANSWER: the guillotine
The instrument of execution that Dickens referred to as "the national razor" in "A Tale of Two Cities" w

In [22]:
jeopardy_dataset["incorrect_statements"] = incorrect_statements
jeopardy_dataset["correct_statements"] = correct_statements

In [58]:
for index, row in jeopardy_dataset.iterrows():
    print(index)
    print("QUESTION:", row["full_sentence_question"])
    print("PROVIDED ANSWER:", row["question"])
    print("CORRECT:", row["correct_statements"])
    print("INCORRECT:", row["incorrect_statements"])
    print()

0


KeyError: 'full_sentence_question'

In [71]:
jeopardy_dataset["source"] = "jeopardy"

In [72]:
jeopardy_dataset["correct"] = correct
jeopardy_dataset["incorrect"] = incorrect

In [73]:
jeopardy_dataset = jeopardy_dataset[["full_sentence_question", "correct", "incorrect", "ID", "source", "normalized_clue_value"]]

In [125]:
rename = {"full_sentence_question": "question"}
jeopardy_dataset = jeopardy_dataset.rename(columns=rename)

In [127]:
jeopardy_dataset.to_csv("jeopardy_filtered.csv", index=False)

### BigBENCH
- abstract_narrative_understanding
- bbq_lite_json
- causal_judgment
- checkmate_in_one: chess
- code_line_description
- color
- contextual_parametric_knowledge_conflicts
- crash_blossom
- crass_ai — seem to be about weird topics that might invoke some biases? might be interesting
- cs_algorithms - idk? subsequences
- date_understanding
- disambiguation_qa
- elementary_math_qa - step-by-step arithmetic questions
- english_proverbs - probably only keep one of the proverb categories?
- entailed_polarity
- evaluating_information_essentiality - how good are people at determining which information is important
- figure_of_speech_detection
- general_knowledge
- goal_step_wikihow
- gre_reading_comprehension
- human_organs_senses — easy body-related questions
- identify_odd_metaphor
- intersect_geometry - similar to chess, requires visualization, although this might be easier visualization than chess
- key_value_maps
- logic_grid_puzzle - the idea of having these puzzles / mysteries is interesting!
- logical_deduction
- logical_fallacy_detection
- logical_sequence
- mathematical_induction
- movie_recommendation
- navigate
- nonsense_words_grammar
- novel_concepts
- periodic_elements
- physical_intuition
- physics
- question_selection - reverse of qa
- reasoning_about_colored_objects
- sentence_ambiguity
- similarities_abstraction - relatively easy
- sports_understanding
- strange_stories
- strategyqa - idk some of these are obvious, but maybe that’s good
- temporal_sequences
- tracking_shuffled_objects
- understanding_fables
- undo_permutation
- unit_interpretation
- vitaminc_fact_verification
- what_is_the_tao - style and apparently there is a correct answer

In [103]:
big_bench_questions = []
big_bench_correct = []
big_bench_incorrect = []
big_bench_sources = []

In [104]:
big_bench_categories = [
    "abstract_narrative_understanding",
    "bbq_lite_json",
    "causal_judgment",
    "cause_and_effect",
    "checkmate_in_one",
    "code_line_description",
    "color",
    "contextual_parametric_knowledge_conflicts",
    "crash_blossom",
    "crass_ai",
    "cs_algorithms",
    "date_understanding",
    "disambiguation_qa",
    "elementary_math_qa",
    "english_proverbs",
    "entailed_polarity",
    "evaluating_information_essentiality",
    "figure_of_speech_detection",
    "general_knowledge",
    "goal_step_wikihow",
    "gre_reading_comprehension",
    "human_organs_senses",
    "identify_odd_metaphor",
    "intersect_geometry",
    "key_value_maps",
    "logic_grid_puzzle",
    "logical_deduction",
    "logical_fallacy_detection",
    "logical_sequence",
    "mathematical_induction",
    "movie_recommendation",
    "navigate",
    "nonsense_words_grammar",
    "novel_concepts",
    "physical_intuition",
    "physics",
    "question_selection",
    "reasoning_about_colored_objects",
    "sentence_ambiguity",
    "similarities_abstraction",
    "sports_understanding",
    "strange_stories",
    "strategyqa",
    "temporal_sequences",
    "tracking_shuffled_objects",
    "understanding_fables",
    "undo_permutation",
    "unit_interpretation",
    "vitaminc_fact_verification",
    "what_is_the_tao"
]

In [105]:
for c in tqdm(big_bench_categories):
    dataset = load_dataset('tasksource/bigbench', name=c, split="train")
    # dataset = dataset["train"]
    shuffled_dataset = dataset.shuffle(seed=42)
    sampled_questions = shuffled_dataset.select(range(10))
            
    choices = sampled_questions["multiple_choice_targets"]
    indicators = sampled_questions["multiple_choice_scores"]
    questions = sampled_questions["inputs"]
    
    for question, indicator, choice_list in zip(questions, indicators, choices):
        big_bench_questions.append(question)
        
        correct_index = indicator.index(1)
        correct_answer = choice_list[correct_index]
        big_bench_correct.append(correct_answer)
        
        incorrect_choices = [item for idx, item in enumerate(choice_list) if idx != correct_index]
        incorrect_answer = random.choice(incorrect_choices)
        big_bench_incorrect.append(incorrect_answer)
    
        big_bench_sources.append(c) 


  0%|          | 0/51 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [114]:
big_bench_difficulty = [None for _ in range(len(big_bench_questions))]

In [891]:
big_bench_dataset = {"question": big_bench_questions, 
                     "correct": big_bench_correct,
                     "incorrect": big_bench_incorrect, 
                     "MCQ_id": big_bench_difficulty,
                     "source": big_bench_sources, 
                     "difficulty": big_bench_difficulty}
big_bench_dataset = pd.DataFrame(big_bench_dataset)

Manually fix questions and choices

In [79]:
big_bench_dataset.loc[big_bench_dataset["source"] == "abstract_narrative_understanding", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "abstract_narrative_understanding", "question"].str.replace('\n', '\n\n', regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "abstract_narrative_understanding", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "abstract_narrative_understanding", "question"].str.replace('we provide short narratives, each of which illustrates a common proverb.', 'we provide a short narrative, which illustrates a common proverb.', regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "abstract_narrative_understanding", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "abstract_narrative_understanding", "question"].str.replace('This narrative is a good illustration of the following proverb:', 'Which of the following proverbs does this narrative best illustrate?', regex=True)

In [80]:
big_bench_dataset.loc[big_bench_dataset["source"] == "bbq_lite_json", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "bbq_lite_json", "question"].str.replace(r'^Q:\s*', '', regex=True).str.split('(?<=\?)').str[0]

In [81]:
big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"].str.replace('\n\n\n', '\n\n', regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"].str.replace('How would a typical person answer each of the following questions about causation?', '', regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"].str.replace('\nA:', '', regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "causal_judgment", "question"].str.replace(r'\?\n\nQ: ', '', regex=True)

In [82]:
big_bench_dataset.loc[big_bench_dataset["source"] == "code_line_description", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "code_line_description", "question"].str.replace('\n\nEnglish language description:', '', regex=True)
prefix_string = "You are shown a computer program. Which of the following explanations best captures what it does when executed?\n\n"
big_bench_dataset.loc[big_bench_dataset["source"] == "code_line_description", "question"] = prefix_string + big_bench_dataset["question"]

In [83]:
big_bench_dataset.loc[big_bench_dataset["source"] == "color", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "color", "question"].str.extract(r'Q: (.*?\?)')[0]

In [84]:
big_bench_dataset.loc[big_bench_dataset["source"] == "crash_blossom", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "crash_blossom", "question"].str.replace('\nA:', '', regex=True)

In [85]:
big_bench_dataset.loc[big_bench_dataset["source"] == "crass_ai", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "crass_ai", "question"].str.extract(r'Q: (.*?\?)')[0]

In [86]:
big_bench_dataset.loc[big_bench_dataset["source"] == "date_understanding", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "date_understanding", "question"].str.extract(r'Q: (.*?\?)')[0]

In [87]:
big_bench_dataset.loc[big_bench_dataset["source"] == "disambiguation_qa", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "disambiguation_qa", "question"].str.split(' choice:').str[0]

In [88]:
big_bench_dataset.loc[big_bench_dataset["source"] == "elementary_math_qa", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "elementary_math_qa", "question"].str.split(' choice:').str[0]

In [89]:
big_bench_dataset.loc[big_bench_dataset["source"] == "english_proverbs", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "english_proverbs", "question"].str.extract(r'Q: (.*?\?)')[0]

In [90]:
big_bench_dataset.loc[big_bench_dataset["source"] == "entailed_polarity", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "entailed_polarity", "question"].str.replace("Q: ", "\n", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "entailed_polarity", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "entailed_polarity", "question"].str.replace("\nA:", "", regex=True)

In [91]:
big_bench_dataset.loc[big_bench_dataset["source"] == "evaluating_information_essentiality", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "evaluating_information_essentiality", "question"].str.split(' choice:').str[0]

In [92]:
big_bench_dataset.loc[big_bench_dataset["source"] == "figure_of_speech_detection", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "figure_of_speech_detection", "question"].str.replace("\nFigure of speech:", "", regex=True)

In [93]:
big_bench_dataset.loc[big_bench_dataset["source"] == "general_knowledge", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "general_knowledge", "question"].str.extract(r'Q: (.*?\?)')[0]

In [94]:
big_bench_dataset.loc[big_bench_dataset["source"] == "goal_step_wikihow", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "goal_step_wikihow", "question"].str.extract(r'Q: (.*?)(?=\n  choice:)')[0]

In [95]:
big_bench_dataset.loc[big_bench_dataset["source"] == "gre_reading_comprehension", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "gre_reading_comprehension", "question"].str.extract(r'Q: (.*?)(?=\n  choice:)')[0]
prefix_string = "Based on the provided passage, which choice do you think best completes the statement or addresses the question at the end?\n\n"
big_bench_dataset.loc[big_bench_dataset["source"] == "gre_reading_comprehension", "question"] = prefix_string + big_bench_dataset["question"]

In [96]:
big_bench_dataset.loc[big_bench_dataset["source"] == "human_organs_senses", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "human_organs_senses", "question"].str.extract(r'Q: (.*?\?)')[0]

In [97]:
big_bench_dataset.loc[big_bench_dataset["source"] == "identify_math_theorems", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "identify_math_theorems", "question"].str.replace("answeres", "answers", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "identify_math_theorems", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "identify_math_theorems", "question"].str.split('\n  choice:').str[0]

In [98]:
big_bench_dataset.loc[big_bench_dataset["source"] == "identify_odd_metaphor", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "identify_odd_metaphor", "question"].str.extract(r'Q: (.*?\?)')[0]

In [99]:
big_bench_dataset.loc[big_bench_dataset["source"] == "intersect_geometry", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "intersect_geometry", "question"].str.replace("\nA:", "", regex=True)

In [100]:
big_bench_dataset.loc[big_bench_dataset["source"] == "key_value_maps", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "key_value_maps", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "key_value_maps", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "key_value_maps", "question"].str.replace("Q: ", "\n", regex=True)

In [101]:
big_bench_dataset.loc[big_bench_dataset["source"] == "logic_grid_puzzle", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "logic_grid_puzzle", "question"].str.split('\n  choice:').str[0].str.replace('Q: ', '', 1)

In [102]:
suffix_string = "\n\nWhich statement logically follows from the provided paragraph?"
big_bench_dataset.loc[big_bench_dataset["source"] == "logical_deduction", "question"] = big_bench_dataset["question"] + suffix_string

In [103]:
big_bench_dataset.loc[big_bench_dataset["source"] == "logical_fallacy_detection", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "logical_fallacy_detection", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "logical_fallacy_detection", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "logical_fallacy_detection", "question"].str.replace("Q: ", "", regex=True)

In [104]:
big_bench_dataset.loc[big_bench_dataset["source"] == "logical_sequence", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "logical_sequence", "question"].str.extract(r'Q: (.*?\?)')[0]

In [105]:
big_bench_dataset.loc[big_bench_dataset["source"] == "mathematical_induction", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "mathematical_induction", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "mathematical_induction", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "mathematical_induction", "question"].str.replace("Q: ", "", regex=True)

In [106]:
big_bench_dataset.loc[big_bench_dataset["source"] == "movie_recommendation", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "movie_recommendation", "question"].str.split(' choice:').str[0]

In [107]:
big_bench_dataset.loc[big_bench_dataset["source"] == "navigate", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "navigate", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "navigate", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "navigate", "question"].str.replace("Q: ", "", regex=True)

In [108]:
big_bench_dataset.loc[big_bench_dataset["source"] == "nonsense_words_grammar", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "nonsense_words_grammar", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "nonsense_words_grammar", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "nonsense_words_grammar", "question"].str.replace("Q: ", "", regex=True)

In [109]:
big_bench_dataset.loc[big_bench_dataset["source"] == "novel_concepts", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "novel_concepts", "question"].str.replace("\nA:", "", regex=True)

In [110]:
big_bench_dataset.loc[big_bench_dataset["source"] == "physical_intuition", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "physical_intuition", "question"].str.split('\n  choice:').str[0].str.replace('Q: ', '', 1)
prefix_string = "Complete the provided statement with the best choice.\n\n"
big_bench_dataset.loc[big_bench_dataset["source"] == "physical_intuition", "question"] = prefix_string + big_bench_dataset["question"]

In [111]:
big_bench_dataset.loc[big_bench_dataset["source"] == "physics", "question"] = (
    big_bench_dataset.loc[big_bench_dataset["source"] == "physics", "question"]
    .str.replace('Q: ', '', 1)  
    .str.split(' choice:').str[0]
)

In [112]:
big_bench_dataset.loc[big_bench_dataset["source"] == "question_selection", "question"] = (
    big_bench_dataset.loc[big_bench_dataset["source"] == "question_selection", "question"]
    .str.split(' choice:').str[0]
)

suffix_string = "\nChoose the appropriate question which has the given answer."
big_bench_dataset.loc[big_bench_dataset["source"] == "question_selection", "question"] = big_bench_dataset["question"] + suffix_string

In [113]:
big_bench_dataset.loc[big_bench_dataset["source"] == "reasoning_about_colored_objects", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "reasoning_about_colored_objects", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "reasoning_about_colored_objects", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "reasoning_about_colored_objects", "question"].str.replace("Q: ", "", regex=True)

In [114]:
prefix_string = "Complete the incomplete analogy.\n\n"
big_bench_dataset.loc[big_bench_dataset["source"] == "similarities_abstraction", "question"] = prefix_string + big_bench_dataset["question"]

In [115]:
big_bench_dataset.loc[big_bench_dataset["source"] == "strange_stories", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "strange_stories", "question"].apply(lambda x: '\n'.join(x.split('\n')[:2]))

In [116]:
big_bench_dataset.loc[big_bench_dataset["source"] == "strange_stories", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "strange_stories", "question"].str.replace("Q: ", "", regex=True)

In [117]:
big_bench_dataset.loc[big_bench_dataset["source"] == "strategyqa", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "strategyqa", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "strategyqa", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "strategyqa", "question"].str.replace("Q: ", "", regex=True)

In [118]:
big_bench_dataset.loc[big_bench_dataset["source"] == "temporal_sequences", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "temporal_sequences", "question"].apply(lambda x: x.split('\n  choice:')[0])

In [119]:
prefix_string = "Based on the provided information, complete the statement.\n\n"
big_bench_dataset.loc[big_bench_dataset["source"] == "tracking_shuffled_objects", "question"] = prefix_string + big_bench_dataset["question"]

In [120]:
big_bench_dataset.loc[big_bench_dataset["source"] == "understanding_fables", "question"] = (
    big_bench_dataset.loc[big_bench_dataset["source"] == "understanding_fables", "question"]
    .str.replace('Q: ', '', 1)  
    .str.split(' choice:').str[0]
)

In [51]:
big_bench_dataset.loc[big_bench_dataset["source"] == "undo_permutation", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "undo_permutation", "question"].str.replace("\nA:", "", regex=True)
big_bench_dataset.loc[big_bench_dataset["source"] == "undo_permutation", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "undo_permutation", "question"].str.replace("Q: ", "", regex=True)

In [52]:
big_bench_dataset.loc[big_bench_dataset["source"] == "undo_permutation", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "undo_permutation", "question"].str.replace("most", "more", regex=True)

In [53]:
big_bench_dataset.loc[big_bench_dataset["source"] == "unit_interpretation", "question"] = (
    big_bench_dataset.loc[big_bench_dataset["source"] == "unit_interpretation", "question"]
    .str.replace('Q: ', '', 1)  
    .str.split(' choice:').str[0]
)

In [54]:
big_bench_dataset.loc[big_bench_dataset["source"] == "what_is_the_tao", "question"] = big_bench_dataset.loc[big_bench_dataset["source"] == "what_is_the_tao", "question"].str.replace("Answer:", "", regex=True)

In [55]:
big_bench_dataset = big_bench_dataset[~(big_bench_dataset["source"] == "cause_and_effect")]

In [309]:
big_bench_dataset.to_csv("big_bench_filtered.csv", index=False)

### MMLU
- high_school_european_history (AP) - reading comprehension
- clinical_knowledge (filter out non-question mark)
- medical_genetics  (filter out non-question mark)
- high_school_us_history (AP) - reading comprehension
- high_school_physics (AP) - reading comprehension, mathematical reasoning
- high_school_world_history (AP) - reading comprehension
- virology
- high_school_microeconomics
- econometrics
- college_computer_science
- high_school_biology
- abstract_algebra
- philosophy? - avoid fill in the blanks?
- professional_medicine - seems to be diagnosis related? maybe we can just use these instead of medcqa
- nutrition (random knowledge questions, honestly kinda similar to truthfulQA-esque knowledge required)
- global_facts (random knowledge questions)
- machine_learning - two statements paired together, need to provide special instructions
- security_studies
- public_relations
- professional_psychology
- prehistory
- anatomy
- college_medicine
- high_school_government_and_politics
- college_chemistry
- logical_fallacies
- high_school_geography
- elementary_mathematics - another simple math dataset - but these are more word problem-esque
- college_mathematics
- high_school_psychology
- formal_logic
- high_school_statistics
- international_law
- high_school_mathematics
- high_school_computer_science
- conceptual_physics
- miscellaneous
- high_school_chemistry
- marketing
- professional_law
- management
- college_physics
- jurisprudence
- world_religions
- sociology
- us_foreign_policy
- high_school_macroeconomics
- computer_security
- electrical_engineering
- astronomy
- college_biology

In [944]:
mmlu_categories = [
    "high_school_european_history",
    "clinical_knowledge",
    "medical_genetics",
    "high_school_us_history",
    "high_school_physics",
    "high_school_world_history",
    "high_school_microeconomics",
    "econometrics",
    "college_computer_science",
    "high_school_biology",
    "abstract_algebra",
    "philosophy",
    "professional_medicine",
    "nutrition",
    "global_facts",
    "machine_learning",
    "security_studies",
    "public_relations",
    "professional_psychology",
    "prehistory",
    "anatomy",
    "college_medicine",
    "high_school_government_and_politics",
    "college_chemistry",
    "logical_fallacies",
    "high_school_geography",
    "elementary_mathematics",
    "college_mathematics",
    "high_school_psychology",
    "formal_logic",
    "high_school_statistics",
    "international_law",
    "high_school_mathematics",
    "high_school_computer_science",
    "conceptual_physics",
    "miscellaneous",
    "high_school_chemistry",
    "marketing",
    "professional_law",
    "management",
    "college_physics",
    "jurisprudence",
    "world_religions",
    "sociology",
    "us_foreign_policy",
    "high_school_macroeconomics",
    "computer_security",
    "electrical_engineering",
    "astronomy",
    "college_biology"
]

In [945]:
len(mmlu_categories)

50

In [966]:
mmlu_questions = []
mmlu_correct = []
mmlu_incorrect = []
mmlu_sources = []

In [967]:
import random

def get_answers(example):
    correct_answer_key = example['target']
    correct_answer = example[correct_answer_key]

    all_keys = ['A', 'B', 'C', 'D']
    all_keys.remove(correct_answer_key)
    
    incorrect_answer_key = random.choice(all_keys)
    incorrect_answer = example[incorrect_answer_key]
    
    return {'correct': correct_answer, 'incorrect': incorrect_answer}

In [968]:
for c in tqdm(mmlu_categories): 
    dataset = load_dataset('lukaemon/mmlu', name=c, split="train", trust_remote_code=True)
    shuffled_dataset = dataset.shuffle(seed=42)
    sampled_questions = shuffled_dataset.select(range(4))
            
    sampled_questions = sampled_questions.map(get_answers)

    mmlu_questions.extend(sampled_questions["input"])
    mmlu_correct.extend(sampled_questions["correct"])
    mmlu_incorrect.extend(sampled_questions["incorrect"])
    mmlu_sources.extend([c]*sampled_questions.num_rows)


  0%|          | 0/50 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/311 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/34 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/272 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/306 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/33 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/112 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/27 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/110 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/69 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/35 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/135 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/173 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/193 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/163 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/198 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/378 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/41 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/545 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/216 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/121 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/270 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/235 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/86 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/203 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/234 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1534 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/170 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/103 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/102 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/108 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/171 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/201 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/43 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/145 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/144 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [973]:
mmlu_difficulty = [None for _ in range(len(mmlu_questions))]

In [974]:
mmlu_dataset = {"question": mmlu_questions, 
                     "correct": mmlu_correct,
                     "incorrect": mmlu_incorrect, 
                     "MCQ_id": mmlu_difficulty,
                     "source": mmlu_sources, 
                     "difficulty": mmlu_difficulty}
mmlu_dataset = pd.DataFrame(mmlu_dataset)

In [977]:
mmlu_dataset[mmlu_dataset["source"]=="philosophy"]

Unnamed: 0,question,correct,incorrect,MCQ_id,source,difficulty
44,"The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.",metaphysics,axiology,,philosophy,
45,"According to d'Holbach, people always act according to _____.",necessary natural laws,free choices,,philosophy,
46,"Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?",pessimist,optimist,,philosophy,
47,"According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:",good.,virtue.,,philosophy,


In [978]:
mmlu_dataset.to_csv("mmlu_filtered.csv", index=False)
# TODO: ADD IN THE DIFFICULTIES FOR MMLU

### QuAIL

In [390]:
quail_dataset = pd.read_json("quail.jsonl", lines=True)

In [391]:
quail_dataset = quail_dataset.sample(110)

In [392]:
def get_question(row):
    return f"Consider the following passage:\n\n{row['context']}\n\n{row['question']}"

In [393]:
quail_dataset["question"] = quail_dataset.apply(get_question, axis=1)

In [394]:
def get_correct_incorrect(row):
    correct_index = int(row["correct_answer_id"])
    correct = row["answers"][correct_index]

    row["answers"].pop(correct_index)

    incorrect = random.choice(row["answers"])

    return correct, incorrect
    

In [395]:
quail_dataset[["correct", "incorrect"]] = quail_dataset.apply(get_correct_incorrect, axis=1, result_type="expand")

In [396]:
quail_dataset["source"] = "QuAIL"
quail_dataset["difficulty"] = None
quail_dataset["MCQ_id"] = None

In [397]:
quail_dataset = quail_dataset[["question", "correct", "incorrect", "MCQ_id", "source", "difficulty"]]

In [405]:
quail_dataset.to_csv("quail_filtered.csv", index=False)

### Bias dataset
https://sites.google.com/view/hbiproject/hbi

In [134]:
biases = pd.read_csv("TRUE_bias.csv")

In [135]:
biases = biases[biases["objective_question"]==True]

In [136]:
biases["Questions"] = biases["Questions"].str.strip()
biases["choice1"] = biases["choice1"].str.strip()
biases["choice2"] = biases["choice2"].str.strip()

In [133]:
biases = biases.drop(48)

In [137]:
biases[biases["choice1"]==biases["choice2"]]

Unnamed: 0.1,Unnamed: 0,Questions,choice1,choice2,category,correct,other_info,objective_question,o1


In [138]:
biases.shape

(155, 9)

In [139]:
biases["category"].value_counts()

category
Confirmation bias                        15
Anchor Bias                              10
Illusion of control                       5
Sunk cost fallacy                         5
Status quo                                5
Proportion dominance                      5
Probability neglect                       5
Probability matching (vs. maximizing)     5
Overconfidence                            5
Outcome bias                              5
Omission bias                             5
Myside bias                               5
Mental accounting                         5
Loss aversion                             5
Framing                                   5
Attribution Bias                          5
Covariation detection                     5
better-than-average bias                  5
Base-rate neglect (causal)                5
Irrational diversification                5
Insensitivity to sample size              5
Gambler's fallacy                         5
Denominator Neglect    

In [140]:
for index, row in biases.iterrows():
    print(f"QUESTION: {row['Questions']}")
    print(f"CHOICE1: {row['choice1']}")
    print(f"CHOICE1: {row['choice2']}")
    print(f"ANSWER: {row['correct']}")
    print(f"CATEGORY: {row['category']}")
    print()

QUESTION: Mount Wycheproof is about 114 meters in height. How tall do you think Mount Kenya in the Great Rift Valley is?
CHOICE1: Mount Kenya is about 510m tall.
CHOICE1: Mount Kenya is about 5100 meters tall.
ANSWER: choice2
CATEGORY: Anchor Bias

QUESTION: Mount Everest is about 8,849 meters in height. What is the height of Mount Stanley in the Rwenzori?
CHOICE1: Mount Stanley is about 5100 meters tall.
CHOICE1: The height of Mount Stanley is approximately 9690 meters.
ANSWER: choice1
CATEGORY: Anchor Bias

QUESTION: Tennis star Ashlyn Krueger has played about 200 singles matches in her career. How many singles matches has John McEnroe played in total?
CHOICE1: He played 1080 singles matches in his career.
CHOICE1: He played 500 singles matches in his career.
ANSWER: choice1
CATEGORY: Anchor Bias

QUESTION: Roger Federer has played 1526 singles matches in his career. What is the total number of singles matches played by Stefan Edberg?
CHOICE1: He played 1800 singles matches in his ca

In [141]:
biases.to_csv("TRUE_bias.csv", index=False)

Reformat dataset to match objective dataset

In [156]:
combined_dataset.columns

Index(['correct', 'incorrect', 'MCQ_id', 'source', 'difficulty', 'choice1', 'choice2', 'choice1_type', 'choice2_type', 'chosen', 'rejected', 'chosen_type', 'rejected_type', 'confidence_scores', 'confidence_difficulty', 'decision_time', 'number_of_clicks', 'scratch_space', 'decision_time_beta', 'num_clicks_beta', 'confidence_beta', 'correct_chosen', 'raw_number_of_clicks', 'raw_decision_time', 'log_decision_time', 'log_norm_decision_time', 'prompt', 'prompt_response_group'], dtype='object')

In [170]:
biases = pd.read_csv("TRUE_bias.csv")
biases.columns

Index(['Unnamed: 0', 'Questions', 'choice1', 'choice2', 'category', 'correct', 'other_info', 'objective_question', 'o1'], dtype='object')

In [171]:
biases = biases.drop("Unnamed: 0", axis=1)

In [172]:
biases["prompt"] = biases["Questions"]
biases = biases.drop("Questions", axis=1)

In [173]:
biases = biases.drop(['objective_question', 'o1'], axis=1)

In [174]:
def get_correct_incorrect(row):
    if row["correct"] == "choice1":
        return row["choice1"], row["choice2"]
    else:
        return row["choice2"], row["choice1"]

In [175]:
biases[["correct", "incorrect"]] = biases.apply(get_correct_incorrect, axis=1, result_type="expand")

In [178]:
def get_choices(row):
    choices = ["correct", "incorrect"]
    random.shuffle(choices)
    return row[choices[0]], row[choices[1]], choices[0], choices[1]    

In [179]:
biases[["choice1", "choice2", "choice1_type", "choice2_type"]] = biases.apply(get_choices, axis=1, result_type="expand")

In [195]:
biases["MCQ_id"] = random.sample(range(5002, 6000), 155)
biases["MCQ_id"] = "MC" + biases["MCQ_id"].astype(str)

In [197]:
def get_id_tag(row):
    if row["choice1_type"] == "incorrect":
        return row["MCQ_id"] + "21"
    else:
        return row["MCQ_id"] + "12"
    

In [198]:
biases["MCQ_id"] = biases.apply(get_id_tag, axis=1)

In [204]:
biases.to_csv("TRUE_bias.csv", index=False)

### Making the entire dataset

In [427]:
trivia_qa = pd.read_csv("trivia_qa_filtered.csv")
jeopardy = pd.read_csv("jeopardy_filtered.csv")
quail = pd.read_csv("quail_filtered.csv")
big_bench = pd.read_csv("big_bench_filtered.csv")
mmlu = pd.read_csv("mmlu_filtered.csv")
biases = pd.read_csv("TRUE_bias.csv")

In [428]:
trivia_qa.loc[~(trivia_qa["question"].str.contains("\?")), "question"] = trivia_qa.loc[~(trivia_qa["question"].str.contains("\?")), "question"] + "?"

In [429]:
jeopardy = jeopardy.sample(50)

In [430]:
dataset = pd.concat([trivia_qa, jeopardy, quail, big_bench, mmlu])

In [431]:
dataset["MCQ_id"] = random.sample(range(1000, 5001), 1000)
dataset["MCQ_id"] = "MC" + dataset["MCQ_id"].astype(str)

In [70]:
null_rows = dataset[dataset["question"].isna()]
null_rows["source"]

Series([], Name: source, dtype: object)

In [97]:
def get_choices(row):
    choices = ["correct", "incorrect"]
    random.shuffle(choices)
    return row[choices[0]], row[choices[1]], choices[0], choices[1]

In [99]:
dataset[["choice1", "choice2", "choice1_type", "choice2_type"]] = dataset.apply(get_choices, axis=1, result_type="expand")

In [115]:
def get_id_tag(row):
    if row["choice1_type"] == "incorrect":
        return row["MCQ_id"] + "21"
    else:
        return row["MCQ_id"] + "12"

In [116]:
dataset["MCQ_id"] = dataset.apply(get_id_tag, axis=1)

In [121]:
dataset.to_csv("objective_dataset.csv", index=False)

Reverse the ordering of choices

In [346]:
def get_combined(dataset):
    dataset_copy = dataset.copy()
    
    def get_reversed_choices(row):
        ID = row["MCQ_id"][:-2]
        tag = row["MCQ_id"][-2:][::-1]    
        return row["choice2"], row["choice1"], row["choice2_type"], row["choice1_type"], ID + tag
    
    dataset_copy[["choice1", "choice2", "choice1_type", "choice2_type", "MCQ_id"]] = dataset_copy.apply(get_reversed_choices, axis=1, result_type="expand")

    def get_updated_prompt_response_group(row):
        return f"Prompt: {row['prompt']}\n\nResponse A: {row['choice1']}\n\nResponse B: {row['choice2']}"

    dataset_copy["prompt_response_group"] = dataset_copy.apply(get_updated_prompt_response_group, axis=1)

    combined_dataset = pd.concat([dataset, dataset_copy])
    return combined_dataset

In [347]:
entire_combined = pd.read_csv("entire_TRUE_dataset.csv")

In [348]:
# def get_id(row):
#     return row["MCQ_id"][:-2]

# entire_combined["ID"] = entire_combined.apply(get_id, axis=1)
# entire_combined = entire_combined.drop_duplicates("ID")

In [352]:
entire_combined.shape

(1150, 31)

In [350]:
from sklearn.model_selection import ShuffleSplit
import numpy as np

def create_cross_val_splits(data, n_splits=5, random_state=42):
    kf = ShuffleSplit(n_splits=n_splits, test_size=0.4, random_state=random_state)
    splits = []
    
    for train_idx, val_idx in kf.split(data):
        train_data = data.iloc[train_idx]
        val_data = data.iloc[val_idx]
        
        val_size = len(val_data)
        split_point = val_size // 2
        
        test_data = val_data.iloc[:split_point]
        cal_data = val_data.iloc[split_point:]
        
        splits.append({
            'train': train_data,
            'val': test_data,
            'cal': cal_data
        })
    
    return splits


In [351]:
for index, split in enumerate(splits):
    get_combined(split["train"]).to_csv(f"TRUE_train_{index}.csv", index=False)
    get_combined(split["val"]).to_csv(f"TRUE_val_{index}.csv", index=False)
    get_combined(split["cal"]).to_csv(f"TRUE_cal_{index}.csv", index=False)

In [355]:
get_combined(entire_combined).to_csv("TRUE_dataset.csv", index=False)

Same format as the subjective dataset

In [361]:
lie = pd.read_csv("/nas/ucb/shivamsinghal/preference-learning-with-bounded-cognition/scalable_oversight/data_collection/truthful_qa/LIE_dataset.csv")

In [362]:
true = pd.read_csv("TRUE_dataset.csv")

In [365]:
lie.columns

Index(['categories', 'correct_concise', 'incorrect_concise', 'correct_detailed', 'choice1_type', 'choice2_type', 'correct_statements', 'incorrect_statements', 'incorrect_detailed', 'IDs', 'tags', 'tag_IDs', 'choice1', 'choice2', 'prompt', 'chosen', 'rejected', 'chosen_type', 'rejected_type', 'confidence_scores', 'confidence_difficulty', 'decision_time', 'number_of_clicks', 'scratch_space', 'raw_number_of_clicks', 'raw_decision_time', 'prompt_response_group'], dtype='object')

In [366]:
true.columns

Index(['correct', 'incorrect', 'MCQ_id', 'source', 'difficulty', 'choice1', 'choice2', 'choice1_type', 'choice2_type', 'chosen', 'rejected', 'chosen_type', 'rejected_type', 'confidence_scores', 'confidence_difficulty', 'decision_time', 'number_of_clicks', 'scratch_space', 'decision_time_beta', 'num_clicks_beta', 'confidence_beta', 'correct_chosen', 'raw_number_of_clicks', 'raw_decision_time', 'log_decision_time', 'log_norm_decision_time', 'prompt', 'prompt_response_group', 'category', 'other_info', 'ID'], dtype='object')

In [411]:
cols1 = set(true.columns)
cols2 = set(lie.columns)
common_cols = cols1.intersection(cols2)    
unique_to_df1 = cols1 - cols2
unique_to_df2 = cols2 - cols1

In [412]:
unique_to_df1

{'correct_chosen', 'difficulty', 'other_info', 'source'}

In [413]:
unique_to_df2

{'correct_concise',
 'correct_detailed',
 'incorrect_concise',
 'incorrect_detailed'}

In [376]:
def get_ids(row):
    return row["ID"][2:]
true["IDs"] = true.apply(get_ids, axis=1)

In [379]:
true = true.drop(["ID"], axis=1)

In [388]:
def get_tags(row):
    return row["MCQ_id"][-2:]
true["tags"] = true.apply(get_tags, axis=1)

In [390]:
true["tag_IDs"] = true["MCQ_id"]
true = true.drop(["MCQ_id"], axis=1)

In [394]:
true_cols = [col for col in true.columns if "beta" in col or "log" in col]
true_cols

['decision_time_beta',
 'num_clicks_beta',
 'confidence_beta',
 'log_decision_time',
 'log_norm_decision_time']

In [395]:
true = true.drop(true_cols, axis=1)

In [396]:
true["categories"] = true["category"]
true = true.drop("category", axis=1)

In [409]:
true["correct_statements"] = true["correct"]
true["incorrect_statements"] = true["incorrect"]

In [410]:
true = true.drop(["correct", "incorrect"], axis=1)

In [420]:
true.to_csv("TRUE_dataset.csv", index=False)