In [None]:
import re
import os
import json

In [None]:
# Benchmark name: selects ./data/test_data/{TASK}_test.json and is passed to the scripts as --task.
TASK = "ARC"
# Model name: passed to the scripts as --model and used in the result/data file names.
MODEL = "olmo-2-7b"

## Stage 1: Primary Preference

In [None]:
data_dir = f"./data/test_data/{TASK}_test.json"
CUDA_VISIBLE_DEVICES=0 python evaluate2.py --task $TASK --n 100 --model $MODEL --data_dir $DATA_DIR

In [None]:
python judge2.py --task $TASK --model $MODEL

In [None]:
# Stage 1 scoring: share of entries whose first recorded answer equals the gold label.
with open(f"./data/test_data/{TASK}_test.json", "r") as f:
    original = json.load(f)

with open(f"./results/T2-judged/{MODEL}_{TASK}.json", "r") as f:
    reformed = json.load(f)

n_correct_1 = 0
n_total = 0
# zip() pairs the two lists by position and stops at the shorter one.
for o, r in zip(original, reformed):
    n_total += 1
    # Drop parentheses so that e.g. "(A)" compares equal to "A".
    r['answer'] = [a.replace("(", "").replace(")", "") for a in r['answer']]
    # An entry without any recorded answer counts as wrong instead of raising IndexError.
    if r['answer'] and o['gold_answer'] == r['answer'][0]:
        n_correct_1 += 1

# Count entries explicitly rather than reusing the loop index, which is undefined
# when there is nothing to score.
accuracy_1 = n_correct_1 / n_total if n_total else float("nan")
print("Accuracy: ", accuracy_1)
print(f'{MODEL}_{TASK}')

## Stage 2: Replace the First Pick

In [None]:
# Stage 2 data preparation: in every judged question, overwrite the text of the
# option the model picked with 'none of the options', then save the rewritten set.
with open(f"./results/T2-judged/{MODEL}_{TASK}.json", "r") as f:
    outputs = json.load(f)

# Matches "(X) option text": group 1 is the letter, group 2 the text up to the
# next parenthesis.
pattern = r'\(([A-Z])\)\s*([^()]+)'
n_skipped = 0
data = []
skipped_indices = []

for x, o in enumerate(outputs):
    try:
        # Drop parentheses so that e.g. "(A)" becomes "A".
        o['answer'] = [a.replace("(", "").replace(")", "") for a in o['answer']]
        # The most recent pick is the last element of the answer list.
        # Presumably the list holds only the Stage 1 pick at this point — TODO confirm.
        answer = o['answer'][-1]

        # Step 1: extract all (A)...(D) options as {letter: text}
        text = o['question']
        options = dict(re.findall(pattern, text))

        # Only replace an option that actually exists; assigning to an unknown key
        # would append a made-up option (e.g. "() none of the options").
        if answer not in options:
            raise ValueError(
                f"answer {answer!r} is not one of the options {sorted(options)}"
            )
        options[answer] = 'none of the options'

        # Step 2: everything before the first option is the question stem
        question_part = re.split(pattern, text)[0].strip()

        # Step 3: rebuild the question with the same letters in the same order
        new_options = [f"({i}) {v.strip()}" for i, v in options.items()]
        rewritten = question_part + " " + " ".join(new_options)

        o['question'] = rewritten

        data.append(o)
    except Exception as e:
        # Report and skip entries that cannot be rewritten.
        # NOTE(review): skipped entries are left out of `data`, so positions after a
        # skip no longer line up with the test file; the Stage 3 cell pairs both
        # lists by position — confirm this is acceptable when Skipped > 0.
        print(f"Error at index {x}: {e}")
        n_skipped += 1
        skipped_indices.append(x)

print(f"Skipped: {n_skipped}")

os.makedirs("./data/removed/", exist_ok=True)
save_path = f"./data/removed/{MODEL}_{TASK}.json"
with open(save_path, "w") as f:
    json.dump(data, f)

In [None]:
data_dir = f"./data/removed/{MODEL}_{TASK}.json"
CUDA_VISIBLE_DEVICES=0 python evaluate2.py --task $TASK --n 100 --model $MODEL --data_dir $DATA_DIR

In [None]:
python judge2.py --task $TASK --model $MODEL

## Stage 3: IoT Evaluation

In [None]:
# Stage 3 data preparation: rebuild each question from the original test set so it
# offers only the options the model picked so far, re-letter them from (A), remap
# the gold label accordingly, and save the result.
with open(f"./results/T2-judged/{MODEL}_{TASK}.json", "r") as f:
    judged = json.load(f)

# Alternative source files for `original` (swap the path below as needed):
#   ./data/mmlu_pro/{TASK}_test.json
#   ./data/shuffled/{TASK}.json
#   ./data/cognitive/{TASK}_test.json
with open(f"./data/test_data/{TASK}_test.json", "r") as f:
    original = json.load(f)

# Matches "(X) option text": group 1 is the letter, group 2 the text up to the
# next parenthesis.
pattern = r'\(([A-Z])\)\s*([^()]+)'

# NOTE(review): the two lists are paired purely by position; this assumes the
# judged file keeps the order (and entries) of the test file — confirm.
for x, (o, j) in enumerate(zip(original, judged)):
    try:
        # Drop parentheses so that e.g. "(A)" becomes "A".
        j['answer'] = [a.replace("(", "").replace(")", "") for a in j['answer']]

        # Step 1: extract all (A)...(D) options of the original question
        text = o['question']
        options = dict(re.findall(pattern, text))

        # Step 2: distinct picked letters that are real options. Normalise before
        # filtering, and sort so the order is the same on every run (iterating a
        # set of strings is not stable across interpreter sessions).
        picked = {a.strip().upper() for a in j['answer']}
        keep = sorted(letter for letter in picked if letter in options)
        filtered = [(k, options[k].strip()) for k in keep]

        # Step 3: extract the question part (everything before the first option)
        question_part = re.split(pattern, text)[0].strip()

        # Step 4: rewrite with the kept options, re-lettering them (A), (B), ...
        new_options = [f"({chr(65+i)}) {v}" for i, (_, v) in enumerate(filtered)]
        rewritten = question_part + " " + " ".join(new_options)

        # Step 5: remap the gold label to its new letter, or "N/A" if it was not kept
        if o['gold_answer'] in keep:
            new_index = keep.index(o['gold_answer'])
            updated_gold = chr(65 + new_index)
        else:
            updated_gold = "N/A"

        # Assign both fields together so a failure above never leaves the entry
        # with a new question but an old gold label.
        j['question'] = rewritten
        j['gold_answer'] = updated_gold
    except Exception as e:
        # Report the entry index and the error; the entry is saved as it was loaded
        # apart from any normalisation already applied to its answers.
        print(x, e)

os.makedirs("./data/reformed/", exist_ok=True)
save_path = f"./data/reformed/{MODEL}_{TASK}.json"
with open(save_path, "w") as f:
    json.dump(judged, f)

In [None]:
CUDA_VISIBLE_DEVICES=0 python evaluate2.py --task $TASK --n 100 --model $MODEL --data_dir $DATA_DIR

In [None]:
python judge2.py --task $TASK --model $MODEL

In [None]:
# Final scoring over the judged results, printing three accuracies in this order:
#   1. the first answer equals the original gold label;
#   2. the original gold label is among the first two answers;
#   3. the first two answers agree on the gold label, or otherwise the third
#      answer equals the entry's own (remapped) gold label.
with open(f"./data/test_data/{TASK}_test.json", "r") as f:
    original = json.load(f)

with open(f"./results/T2-judged/{MODEL}_{TASK}.json", "r") as f:
    reformed = json.load(f)

n_correct_1 = 0
n_correct_2 = 0
n_correct_3 = 0
n_total = 0
# zip() pairs the two lists by position and stops at the shorter one.
for o, r in zip(original, reformed):
    n_total += 1
    # Drop parentheses so that e.g. "(A)" compares equal to "A".
    r['answer'] = [a.replace("(", "").replace(")", "") for a in r['answer']]
    answers = r['answer']
    gold = o['gold_answer']

    # Missing answers become None so short lists count as wrong instead of
    # raising IndexError.
    first, second, third = (answers[:3] + [None, None, None])[:3]

    if first is not None and first == gold:
        n_correct_1 += 1

    if first is not None and first == second == gold:
        n_correct_2 += 1
        n_correct_3 += 1
    else:
        if gold in answers[0:2]:
            n_correct_2 += 1
        # The third answer is compared with the judged entry's own gold label,
        # not the original one.
        if third is not None and r['gold_answer'] == third:
            n_correct_3 += 1

# Count entries explicitly rather than reusing the loop index, which is undefined
# when there is nothing to score.
denominator = n_total if n_total else float("nan")
print("Accuracy: ", n_correct_1 / denominator)
print("Accuracy: ", n_correct_2 / denominator)
print("Accuracy: ", n_correct_3 / denominator)
print(f'{MODEL}_{TASK}')