In [None]:
import json
import re
from typing import List

import numpy as np
import pandas as pd
from datasets import load_dataset

EXPERIMENT_NAME = 'jansen2021dunningkruger/exp1.csv'

# Load the 'train' split and keep only the rows whose 'experiment' field
# matches the experiment selected above.
ds = load_dataset("marcelbinz/Psych-101")['train']
selected_rows = [row for row in ds if row['experiment'] == EXPERIMENT_NAME]

### (Optional) Inspect the data row

In [None]:
# Show the raw 'text' field of the first selected row.
print(selected_rows[0]['text'])

### parse the human response

In [None]:
def parse_double_angle_tokens_dynamic(text: str) -> List[int]:
    """
    Extracts all << >> tokens in order from the grammar quiz text, as integers.

    Each "The choices are:" block lists one option per line as ``LABEL: text``
    (e.g. V, E, H, J, G). The n-th single-letter token is looked up in the
    n-th choices block and replaced by its 1-based position within that block.
    Any other token (like <<4>> or <<50>>) is reduced to its numeric
    characters and truncated to an integer.

    Args:
        text: Full text containing the choices blocks and the << >> tokens.

    Returns:
        One integer per << >> token, in order of appearance.

    Raises:
        ValueError: If a letter token is not a label of its choices block, if
            there are more letter tokens than choices blocks, or if a
            non-letter token does not contain a parseable number.
    """
    choice_blocks = re.findall(
        r"The choices are:\s*((?:[A-Z]:[^\n]+\n?)+)",
        text, flags=re.MULTILINE
    )

    # Every line of a matched block starts with its label, so anchor the label
    # search at line start. An unanchored search would also pick up a capital
    # letter that precedes a colon inside the option text (e.g. "USA:") and
    # shift the positions of all following labels.
    label_orders = [
        re.findall(r"^([A-Z]):", block, flags=re.MULTILINE)
        for block in choice_blocks
    ]

    tokens = re.findall(r"<<\s*(.*?)\s*>>", text)

    results = []
    q_idx = 0  # index of the choices block the next letter token belongs to
    for tok in tokens:
        tok_clean = tok.strip().upper()
        if len(tok_clean) == 1 and tok_clean.isalpha():
            if q_idx >= len(label_orders):
                raise ValueError(f"Out-of-range question index {q_idx}")
            labels = label_orders[q_idx]
            if tok_clean not in labels:
                raise ValueError(f"Unexpected label '{tok_clean}' not in {labels}")
            results.append(labels.index(tok_clean) + 1)
            q_idx += 1
        else:
            # Keep only digits, dots and signs so trailing units are ignored.
            val = re.sub(r"[^\d\.\-\+]", "", tok_clean)
            try:
                results.append(int(float(val)))
            except ValueError as exc:
                # Covers both an empty remainder and leftovers such as "-" or
                # "1.2.3" that are not valid numbers.
                raise ValueError(f"Unrecognized token <<{tok}>>") from exc

    return results

# Parse the text of every selected row into its list of integer responses,
# then stack the per-row lists into one array.
human_response = [
    parse_double_angle_tokens_dynamic(row['text']) for row in selected_rows
]

human_response_array = np.array(human_response)

### save human response to .csv file

In [None]:
# Names for the first four and the last four parsed response columns.
pre_columns = ['pre_accuracy', 'pre_percentile', 'pre_average_difficulty', 'pre_self_difficulty']
post_columns = ['post_accuracy', 'post_percentile', 'post_average_difficulty', 'post_self_difficulty']

df = pd.DataFrame(human_response_array)

# Surround the parsed responses with a sequential QKEY and a constant weight.
df.insert(0, 'QKEY', range(2000001, 2000001 + len(df)))
df.insert(df.shape[1], 'WEIGHT_W1', 1.0)

# Every column that is not QKEY, WEIGHT_W1 or one of the pre/post columns is
# labelled as a question column.
n_question_columns = df.shape[1] - 2 - len(pre_columns) - len(post_columns)
question_columns = [f'Q{number}_W1' for number in range(1, n_question_columns + 1)]

df.columns = ['QKEY', *pre_columns, *question_columns, *post_columns, 'WEIGHT_W1']
df.to_csv('dunning_kruger_responses.csv', index=False)

### generate the question strings json file

In [None]:
row = selected_rows[0]['text']

# question_intro is taken from the original dataset
# you can inspect the second notebook cell for details
question_intro = (
    "Some part of the sentence is in square brackets.\n"
    "Five choices for rephrasing that part follow the sentence; "
    "one choice repeats the original, and the other four are different.\n"
    "Your task is to select the grammatically correct choice."
)

# Cut the text in front of every "Q<number>. " marker. The capture group makes
# re.split also return the bare markers; those are dropped by the filter
# because only real question chunks contain the choices header.
parts = [
    chunk
    for chunk in re.split(r'(?=(Q\d+\.\s))', row)
    if "The choices are:" in chunk
]

# For each question keep only the text before the choices header, without the
# leading "Q<number>." marker, and prepend the shared instructions.
question_strings_dict = {}
for number, chunk in enumerate(parts, start=1):
    without_marker = re.sub(r'^Q\d+\.\s*', '', chunk, flags=re.MULTILINE)
    stem = without_marker.partition("The choices are:")[0].strip()
    question_strings_dict[f'Q{number}_W1'] = question_intro + "\n\n" + stem

In [None]:
# Write the question-key -> question-text mapping as indented JSON.
with open('dunning_kruger_question_strings.json', 'w') as out_file:
    out_file.write(json.dumps(question_strings_dict, indent=4))

### generate the options map json file

In [None]:
options_map_dict = {}
for idx, part in enumerate(parts):
    # Parse only the contiguous "LABEL: text" lines that directly follow the
    # header. Scanning everything after the header would also treat any later
    # line of the chunk that happens to contain a colon as an option.
    choices_block = re.search(r"The choices are:\s*((?:[A-Z]:[^\n]+\n?)+)", part, flags=re.MULTILINE)
    if choices_block is None:
        raise ValueError(f"No choices block found in question {idx+1}")

    options: List[str] = []
    for line in choices_block.group(1).split('\n'):
        if ":" not in line:
            # Skips the empty string left after the block's trailing newline.
            continue
        _label, option_text = line.split(":", 1)
        options.append(option_text.strip())
    assert len(options) == 5, f"Expected 5 options, got {len(options)} in question {idx+1}"

    # Keys are the 1-based option positions, stored as floats.
    options_map_dict[f'Q{idx+1}_W1'] = {
        float(position): option_text
        for position, option_text in enumerate(options, start=1)
    }

In [None]:
# Write the per-question option map as indented JSON.
with open('dunning_kruger_options_map.json', 'w') as out_file:
    out_file.write(json.dumps(options_map_dict, indent=4))

### generate the option strings json file

In [None]:
option_strings_dict = {}
for qkey in question_strings_dict:
    options = options_map_dict[qkey]
    qstring = question_strings_dict[qkey]

    # Pair every option with its integer position and its letter (1 -> A, 2 -> B, ...).
    lettered = [
        (int(opt_idx), chr(ord('A') + int(opt_idx) - 1), opt_text)
        for opt_idx, opt_text in options.items()
    ]

    # Shared prompt: the question followed by one "<letter>. <option>" line
    # per option and a blank line.
    option_lines = "".join(
        f"{letter}. {opt_text}\n" for _position, letter, opt_text in lettered
    )
    full_question = "Question: " + qstring + "\n" + option_lines + "\n"

    # One entry per option: the shared prompt completed with that option as the answer.
    for position, letter, opt_text in lettered:
        option_strings_dict[f"{qkey}_option_{position}"] = (
            full_question + "Answer: " + letter + ". " + opt_text
        )

In [None]:
# Write the option-key -> full prompt-with-answer mapping as indented JSON.
with open('dunning_kruger_option_strings.json', 'w') as out_file:
    out_file.write(json.dumps(option_strings_dict, indent=4))