In [1]:
import pandas as pd
import json
import random
from openai import OpenAI
from time import sleep
from tqdm.notebook import tqdm
import numpy as np
import re
import pickle as pkl
import math
import os

import concurrent.futures
from functools import partial

import mwparserfromhell
import pandas as pd
import gensim.downloader as api
from gensim.corpora import WikiCorpus
from multiprocessing import Pool

import pickle

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
# Show DataFrames without truncating columns, rows, line width or cell text.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': 100000,
    'display.max_colwidth': None,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [3]:
from getpass import getpass

# Read the keys without echoing them, so the secrets are not displayed in
# (and saved with) the notebook's cell output.
gpt_api_key = getpass("Enter your OpenAI API key: ")
nvidia_api_key = getpass("Enter your NVIDIA API key: ")

# Client used for the GPT models.
gpt_client = OpenAI(
    api_key=gpt_api_key,
)
os.environ["OPENAI_API_KEY"] = gpt_api_key

# Client used for the Llama models, pointed at NVIDIA's endpoint.
llama_client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=nvidia_api_key,
)

KeyboardInterrupt: Interrupted by user

In [3]:
# CSV locations; TRAIN_PATH is also the file later cells write back to.
TRAIN_PATH, TEST_PATH = (
    "train_llama3-70b_regenerated_detailed.csv",
    "test_llama3-70b_regenerated_detailed.csv",
)

train_set, test_set = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [None]:
def correct_chosen(row):
    """Return 1 if the row's ``chosen_type`` does not contain "incorrect", else 0."""
    picked_incorrect = "incorrect" in row["chosen_type"]
    return 0 if picked_incorrect else 1

# Row-wise flag: 1 when chosen_type does not contain "incorrect", otherwise 0.
train_set["correct_chosen"] = train_set.apply(correct_chosen, axis=1)

In [None]:
# Distinct question texts in first-appearance order, one list per split.
train_questions = train_set["questions"].drop_duplicates().tolist()
test_questions = test_set["questions"].drop_duplicates().tolist()

# Betas being tested
## Annotator-specific betas
- annotator confidence
- time spent per question — but different people might spend different amounts of time, so will need to justify this
- number of clicks on the question page

## Ground truth betas / other baselines:
- Length and correctness agree with each other
- GPT-3.5 picks an answer 10 times; its answers are compared to the provided correct answer
    - easy questions only, based on GPT-3.5 difficulty
- GPT-4o and GPT-4-turbo answer once and compared to the provided correct answer
- Artificially labeled dataset with five fold validation
- Wiki dataset frequency
- Flesch–Kincaid

## Prompting
- GPT-3.5 picks answer 10 times, and GPT-4 judges (turbo and o)
- Zero-shot prompting using the criteria of annotator knowledge, resources (e.g., time), and cognitive biases. This will involve the model evaluating the questions + the two responses.
- CoT autograder — getting holistic scores (i.e., we get the LLM to give scores for all questions in one go and then get one holistic difficulty score)
    - autograder
    - simpler CoT prompt with reused zero shot prompt
- Individually asking the questions that we had for the autograder
- Few shot prompting (either one shot or two shot): depending on the cost just with GPT-3.5?
- Try prompting individually for the cognitive biases too?
- Potentially fine-tuning on a cogsci dataset?
- Maybe tree-of-thought prompting?

## Models to try:
- GPT-3.5
- GPT-4-turbo
- GPT-4o
- Llama3-8B
- Llama3-70B

### Ground truth beta based on frequency in Wiki text

In [None]:
import logging
from gensim.corpora import WikiCorpus
from gensim.models import TfidfModel
from tqdm.notebook import tqdm

# Set up logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Path to the Wikipedia dump
# NOTE(review): absolute, machine-specific path; the cell will not run elsewhere without editing it.
wiki_bz2 = "/nas/ucb/shivamsinghal/preference-learning-with-bounded-cognition/enwiki-latest-pages-articles.xml.bz2"

# Load the corpus and wrap it with tqdm for a progress bar
class TqdmWikiCorpus(WikiCorpus):
    """WikiCorpus whose get_texts() iterator is wrapped in a tqdm progress bar."""

    def get_texts(self):
        # Same documents as the parent class, just reported through tqdm.
        return tqdm(super().get_texts(), desc='Processing documents')

# NOTE(review): constructing the corpus presumably scans the whole dump (slow) — confirm before re-running.
wiki_corpus = TqdmWikiCorpus(wiki_bz2)

# Create a TF-IDF model
# NOTE(review): `tfidf` is not referenced again in the visible part of the notebook.
tfidf = TfidfModel(wiki_corpus)


In [None]:
# Lower-cased training question texts (duplicates collapse in the set).
questions_set = set(train_set['questions'].str.lower())

def find_question_frequency():
    """Count, for each question in questions_set, how many corpus texts contain it.

    Returns:
        dict mapping each lower-cased question string to the number of texts
        from wiki_corpus.get_texts() in which it occurs as a substring.
    """
    question_freq = {q: 0 for q in questions_set}
    # wiki_corpus.get_texts() already returns a tqdm iterator, so this wraps a second bar around it.
    for text in tqdm(wiki_corpus.get_texts()):
        # Each text is joined with single spaces before matching.
        parsed_text = mwparserfromhell.parse(" ".join(text))
        for question in questions_set:
            # Substring test; parsed_text.lower() is recomputed for every question.
            if question in parsed_text.lower():
                question_freq[question] += 1
    return question_freq

question_frequency = find_question_frequency()


### Ground truth beta based on length
- Beta = 0.5 when the length of the statements is the same, or the factual correctness is the same
- Beta = 0 when the length and the correctness disagree (correct statement is concise)
- Beta = 1 when the length and the correctness agree (correct statement is detailed)

In [None]:
def get_ground_truth_difficulty(row):
    """Map the first four characters of ``row["tag_IDs"]`` to 0.5, 0, 1 or None.

    The characters are read as two pairs, (tag[0], tag[1]) and (tag[2], tag[3]):
      * 0.5 when the first characters of the pairs match, or the second ones do;
      * 0 when either pair is ("1", "2");
      * 1 when either pair is ("1", "1");
      * None for any other combination.
    """
    tag = str(row["tag_IDs"])[:4]
    if tag[0] == tag[2] or tag[1] == tag[3]:
        return 0.5

    pairs = ((tag[0], tag[1]), (tag[2], tag[3]))
    if ("1", "2") in pairs:
        return 0
    if ("1", "1") in pairs:
        return 1
    return None

In [None]:
# Tag-derived value per row, plus its complement (1 - value) as the "inverse beta" column.
train_set["ground_truth_difficulty"] = train_set.apply(get_ground_truth_difficulty, axis=1)
train_set["ground_truth_inverse_beta"] = 1-train_set["ground_truth_difficulty"]

In [None]:
# Overwrites the CSV at TRAIN_PATH (the file train_set was loaded from) with the new columns.
train_set.to_csv(TRAIN_PATH, index=False)

### Ground truth beta based on correctness across 4 copies of question

In [None]:
# Keep only rows whose tag_IDs differ at positions 0 and 2.
correct_incorrect_answer_pairs = train_set[train_set["tag_IDs"].apply(lambda x: str(x)[0] != str(x)[2])]

In [None]:
# Per-question fraction of rows with correct_chosen == 1, as a {question: fraction} dict.
_chosen_by_question = correct_incorrect_answer_pairs.groupby('questions')['correct_chosen']
sum_data = _chosen_by_question.sum()
count_data = _chosen_by_question.count()
result = dict(sum_data / count_data)

In [None]:
def get_ground_truth(row):
    """Return 0.5 when tag_IDs positions 0 and 2 match, else the per-question value in ``result``."""
    tag = str(row["tag_IDs"])
    if tag[0] == tag[2]:
        return 0.5
    return result[row["questions"]]

In [None]:
# Per-row value from get_ground_truth: 0.5, or the question's entry in `result`.
train_set["ground_truth_approx_correctness_difficulty"] = train_set.apply(get_ground_truth, axis=1)

In [9]:
# Complement (1 - value) of the approx-correctness column.
train_set["ground_truth_approx_correctness_beta"] = 1-train_set["ground_truth_approx_correctness_difficulty"]

In [10]:
# Write the frame, including the approx-correctness columns, back over TRAIN_PATH.
train_set.to_csv(TRAIN_PATH, index=False)

### Artificially labeled dataset
- When there are correct and incorrect pairs, just choose the correct answer
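- When two statements of the same correctness are paired together, choose randomly. (The current implementation has the random choice commented out and instead picks deterministically from the second character of `tag_IDs`.)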

In [4]:
def get_chosen_rejected(row):
    """Pick the chosen/rejected choice for a row from its ``tag_IDs``.

    Returns:
        Tuple ``(chosen, rejected, chosen_type, rejected_type)`` taken from the
        row's ``choice1``/``choice2`` and ``choice1_type``/``choice2_type`` fields.
    """
    tag = str(row["tag_IDs"])[:4]
    if tag[0] != tag[2]:
        # Tags at positions 0 and 2 differ: choice1 wins when its tag is "1".
        chosen, rejected = ("choice1", "choice2") if tag[0] == "1" else ("choice2", "choice1")
    elif tag[1] == "2":
        # Same tag at positions 0 and 2: decided by choice1's second tag character.
        chosen, rejected = "choice1", "choice2"
    else:
        chosen, rejected = "choice2", "choice1"
    return row[chosen], row[rejected], row[f"{chosen}_type"], row[f"{rejected}_type"]

In [5]:
# Replaces chosen/rejected and their *_type columns with the tag-derived artificial labels.
train_set[["chosen", "rejected", "chosen_type", "rejected_type"]] = train_set.apply(get_chosen_rejected, axis=1, result_type='expand')

In [None]:
from sklearn.model_selection import GroupKFold

# Group id = row index // 4, so each run of four consecutive rows shares an id.
# NOTE(review): presumably relies on every question occupying exactly 4 adjacent rows — confirm.
train_set['question_id'] = train_set.index // 4
# GroupKFold keeps all rows of one group in the same fold.
gkf = GroupKFold(n_splits=5)
fold_number = 1
for train_index, test_index in gkf.split(train_set, groups=train_set['question_id']):
    fold_train = train_set.iloc[train_index]
    fold_test = train_set.iloc[test_index]

    # One train/test CSV pair per fold, written to the working directory.
    fold_train.to_csv(f'train_artificial_fold_{fold_number}.csv', index=False)
    fold_test.to_csv(f'test_artificial_fold_{fold_number}.csv', index=False)
    
    print(f"Training and testing datasets for fold {fold_number} saved.")
    fold_number += 1

In [14]:
# Save the full frame (with artificial labels) to a separate file, leaving TRAIN_PATH untouched.
train_set.to_csv("artificial_train_dataset_llama70B.csv", index=False)

In [12]:
# Row count per chosen_type value.
train_set.groupby("chosen_type").size()

chosen_type
correct_concise       627
correct_detailed      272
incorrect_detailed     98
dtype: int64

In [13]:
# NOTE(review): `old` is not defined anywhere in the visible notebook, so this cell fails on a fresh kernel.
old.groupby("chosen_type").size()

chosen_type
correct_concise       692
correct_detailed      207
incorrect_concise      53
incorrect_detailed     45
dtype: int64

### GPT-3.5 answers 10 times / GPT-4-turbo and GPT-4o answer once
- judged based on provided correct answer
- judged by GPT-4-turbo / o

Get GPT-3.5 to answer the same question multiple times and count the number of times that it gets it correct
- When the choices differ in their factual correctness, the factually correct one is correct.
- When the choices are both incorrect, the concise one would be considered correct. Even though there is technically no correct answer, it makes sense that we wouldn't want an LLM to output detailed information that is wrong.
- When the choices are both correct, either the detailed or the concise one would be considered correct. Here, the difficulty is simply set to 0.

In [None]:
def generate_output_gpt(question, model="gpt-3.5-turbo"):
    """Send one multiple-choice question to a chat model via gpt_client.

    Args:
        question: text sent as the user message.
        model: model name passed to the chat-completions call.

    Returns:
        The message content of the first returned choice.
    """
    system_message = {
        "role": "system",
        "content": "Please answer the multiple choice question using only one letter, either 'A' or 'B'.",
    }
    user_message = {"role": "user", "content": question}
    completion = gpt_client.chat.completions.create(
        messages=[system_message, user_message],
        model=model,
    )
    return completion.choices[0].message.content

In [None]:
# train set mcqs generation
train_set_mcqs = []
train_set_mcq_answers = []
for index, row in train_set.iterrows():
    choices_order = random.sample(["choice1", "choice2"], 2)
    qs = f"{row['questions']} (A.) {row[choices_order[0]]} (B.) {row[choices_order[1]]}"
    train_set_mcqs.append(qs)

    if row["choice1_type"].split("_")[0] == "correct" and row["choice2_type"].split("_")[0] == "correct":
        train_set_mcq_answers.append(None)
        continue
        
    if row["choice1_type"].split("_")[0] == "incorrect" and row["choice2_type"].split("_")[0] == "incorrect":
        chosen = "choice1" if row["choice1_type"].split("_")[1] == "concise" else "choice2"
    else:
        chosen = "choice1" if row["choice1_type"].split("_")[0] == "correct" else "choice2"
    
    chosen = "A" if choices_order[0] == chosen else "B"
    train_set_mcq_answers.append(chosen)

In [None]:
# One GPT-4-turbo answer per MCQ; executor.map yields results in input order.
partial_generate_output_gpt = partial(generate_output_gpt, model="gpt-4-turbo")
with concurrent.futures.ThreadPoolExecutor() as executor:
    answer_stream = executor.map(partial_generate_output_gpt, train_set_mcqs)
    results = list(tqdm(answer_stream, total=len(train_set_mcqs)))
gpt_4_turbo_answers = list(results)

In [None]:
# One GPT-4o answer per MCQ; executor.map yields results in input order.
partial_generate_output_gpt = partial(generate_output_gpt, model="gpt-4o")
with concurrent.futures.ThreadPoolExecutor() as executor:
    answer_stream = executor.map(partial_generate_output_gpt, train_set_mcqs)
    results = list(tqdm(answer_stream, total=len(train_set_mcqs)))
gpt_4o_answers = list(results)

In [None]:
# Reduce each reply to its leading "A"/"B"; replies that do not start with
# either letter are printed and left unchanged for manual inspection.
for i in range(len(gpt_4_turbo_answers)):
    answer = gpt_4_turbo_answers[i]
    # `not answer` guards empty/None replies, which would otherwise fail on answer[0].
    if not answer or answer[0] not in ["A", "B"]:
        print(answer)
    else:
        gpt_4_turbo_answers[i] = answer[0]

In [None]:
# Reduce each reply to "A"/"B" using its first character, falling back to the
# second character; anything else is printed and left unchanged.
for i in range(len(gpt_4o_answers)):
    answer = gpt_4o_answers[i]
    if answer and answer[0] in ["A", "B"]:
        gpt_4o_answers[i] = answer[0]
    # The length check avoids an IndexError on one-character replies such as "C".
    elif answer and len(answer) > 1 and answer[1] in ["A", "B"]:
        gpt_4o_answers[i] = answer[1]
    else:
        print(answer)

In [None]:
def process_question_gt(q):
    """Count how many of 10 default-model answers to MCQ ``q`` match the answer key.

    Returns 10 without querying when the key entry is None. A reply whose first
    character is not "A"/"B" is re-requested until it is.
    """
    expected = train_set_mcq_answers[q]
    if expected is None:
        return 10

    prompt = train_set_mcqs[q]
    hits = 0
    for _ in range(10):
        reply = generate_output_gpt(prompt)
        while reply[0] not in {"A", "B"}:
            print("IN LOOP")
            reply = generate_output_gpt(prompt)
        if reply[0] == expected:
            hits += 1
    return hits

In [None]:
def process_question_gpt4(q, gpt_4_answers):
    """Count how many of 10 default-model answers to MCQ ``q`` equal ``gpt_4_answers[q]``.

    A reply whose first character is not "A"/"B" is re-requested until it is.
    """
    prompt = train_set_mcqs[q]
    reference = gpt_4_answers[q]
    hits = 0
    for _ in range(10):
        reply = generate_output_gpt(prompt)
        while reply[0] not in {"A", "B"}:
            print("IN LOOP")
            reply = generate_output_gpt(prompt)
        if reply[0] == reference:
            hits += 1
    return hits

In [None]:
# Collect results in submission order so train_scores_gt[i] belongs to MCQ i.
# (as_completed() yields futures in completion order, which would misalign the
# scores with the rows they are later assigned to.)
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_question_gt, q) for q in range(len(train_set_mcqs))]
    train_scores_gt = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
# Fraction of the 10 answers that matched the key, and its complement.
# Assigned by position, so train_scores_gt must be ordered like train_set's rows.
train_set["gpt-3.5_ground_truth_difficulty"] = np.array(train_scores_gt)/10
train_set["gpt-3.5_ground_truth_inverse_beta"] = 1-train_set["gpt-3.5_ground_truth_difficulty"]

In [None]:
# Collect results in submission order so train_scores_gpt4[i] belongs to MCQ i.
# (as_completed() yields futures in completion order, which would misalign the
# scores with the rows they are later assigned to.)
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(process_question_gpt4, q, gpt_4_turbo_answers)
        for q in range(len(train_set_mcqs))
    ]
    train_scores_gpt4 = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
# Fraction of the 10 answers that agreed with GPT-4-turbo, and its complement.
# Assigned by position, so train_scores_gpt4 must be ordered like train_set's rows.
train_set["gpt-3.5_gpt-4-turbo_judge_difficulty"] = np.array(train_scores_gpt4)/10
train_set["gpt-3.5_gpt-4-turbo_judge_inverse_beta"] = 1-train_set["gpt-3.5_gpt-4-turbo_judge_difficulty"]

In [None]:
# Collect results in submission order so train_scores_gpt4o[i] belongs to MCQ i.
# (as_completed() yields futures in completion order, which would misalign the
# scores with the rows they are later assigned to.)
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(process_question_gpt4, q, gpt_4o_answers)
        for q in range(len(train_set_mcqs))
    ]
    train_scores_gpt4o = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
# Fraction of the 10 answers that agreed with GPT-4o, and its complement.
# Assigned by position, so train_scores_gpt4o must be ordered like train_set's rows.
train_set["gpt-3.5_gpt-4o_judge_difficulty"] = np.array(train_scores_gpt4o)/10
train_set["gpt-3.5_gpt-4o_judge_inverse_beta"] = 1-train_set["gpt-3.5_gpt-4o_judge_difficulty"]

In [None]:
# 1 where the key entry is None or GPT-4-turbo's answer equals the key, else 0.
gpt_4_turbo_gt_comparison = [
    int(train_set_mcq_answers[i] is None or gpt_4_turbo_answers[i] == train_set_mcq_answers[i])
    for i in range(len(gpt_4_turbo_answers))
]

In [None]:
# Stores the 0/1 match flags and their complement.
# NOTE(review): the complement is named "..._beta" here but "..._inverse_beta" for the GPT-3.5 columns — confirm which is meant.
train_set["gpt-4-turbo_ground_truth_difficulty"] = np.array(gpt_4_turbo_gt_comparison)
train_set["gpt-4-turbo_ground_truth_beta"] = 1-train_set["gpt-4-turbo_ground_truth_difficulty"]

In [None]:
# 1 where the key entry is None or GPT-4o's answer equals the key, else 0.
gpt_4o_gt_comparison = [
    int(train_set_mcq_answers[i] is None or gpt_4o_answers[i] == train_set_mcq_answers[i])
    for i in range(len(gpt_4o_answers))
]

In [None]:
# Stores the 0/1 match flags for GPT-4o and their complement.
train_set["gpt-4o_ground_truth_difficulty"] = np.array(gpt_4o_gt_comparison)
train_set["gpt-4o_ground_truth_beta"] = 1-train_set["gpt-4o_ground_truth_difficulty"]

In [None]:
# Persist the model-answer columns by overwriting the CSV at TRAIN_PATH.
train_set.to_csv(TRAIN_PATH, index=False)

## Zero-shot Prompting

Evaluating both the question and answer groups without examples 

Prompt is here: difficulty_evaluation_prompt_zero_shot.txt

In [None]:
# Load the zero-shot prompt template into the global `autograder_prompt`.
# NOTE: later cells rebind this same global to other templates, and functions that read it pick up whichever was loaded last.
with open("difficulty_evaluation_prompt_zero_shot.txt", "r") as f:
    autograder_prompt = f.read()

In [None]:
def generate_zero_shot_output(question, response1, response2, model="gpt-3.5-turbo", temperature=0.0):
    """Fill the current ``autograder_prompt`` and send it as a single user message.

    Args:
        question, response1, response2: values substituted into the template.
        model: model name passed to the chat-completions call.
        temperature: sampling temperature passed to the call.

    Returns:
        The message content of the first returned choice.
    """
    filled_prompt = autograder_prompt.format(question=question, response1=response1, response2=response2)
    completion = gpt_client.chat.completions.create(
        messages=[{"role": "user", "content": filled_prompt}],
        model=model,
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [None]:
def generate_response(row, model="gpt-3.5-turbo"):
    """Run the zero-shot prompt on one row's question and its two choices."""
    question, first, second = row["questions"], row["choice1"], row["choice2"]
    return generate_zero_shot_output(question, first, second, model=model)

In [None]:
# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(generate_response, row) for _, row in train_set.iterrows()]
    gpt_35_zero_shot = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
partial_generate_response = partial(generate_response, model="gpt-4o")

# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(partial_generate_response, row) for _, row in train_set.iterrows()]
    gpt_4o_zero_shot = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
partial_generate_response = partial(generate_response, model="gpt-4-turbo")

# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(partial_generate_response, row) for _, row in train_set.iterrows()]
    gpt_4_turbo_zero_shot = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
def extract_score_format(arr):
    """Return the numeric scores found in strings containing ``SCORE: <digits>``.

    Strings without such a match are printed and contribute nothing to the
    result, so the output can be shorter than the input.
    """
    score_token = re.compile(r"SCORE:\s*\d+")
    matches = []
    for string in arr:
        found = score_token.search(string)
        if found is None:
            print(string)
            continue
        matches.append(found.group())

    return get_score_num(matches)

def get_score_num(arr):
    """Convert the digits after ``SCORE:`` in each string to float; non-matching strings are skipped."""
    score_value = re.compile(r"SCORE:\s*(\d+)")
    return [float(found.group(1)) for found in map(score_value.search, arr) if found]

In [None]:
# Parse the numeric score from each raw reply. extract_score_format drops replies without a match, which trips the assert.
# NOTE: rebinding gpt_35_zero_shot to floats means re-running this cell fails (re.search needs strings).
matches = extract_score_format(gpt_35_zero_shot)
assert len(matches)==len(train_set)
gpt_35_zero_shot = matches

In [None]:
# Replace the raw GPT-4-turbo replies with their parsed scores; the assert requires one score per row.
matches = extract_score_format(gpt_4_turbo_zero_shot)
assert len(matches)==len(train_set)
gpt_4_turbo_zero_shot = matches

In [None]:
# Replace the raw GPT-4o replies with their parsed scores; the assert requires one score per row.
matches = extract_score_format(gpt_4o_zero_shot)
assert len(matches)==len(train_set)
gpt_4o_zero_shot = matches

In [None]:
# Positional assignment: each list must be ordered like train_set's rows.
train_set["gpt-3.5_zero_shot_difficulty"] = gpt_35_zero_shot
train_set["gpt-4-turbo_zero_shot_difficulty"] = gpt_4_turbo_zero_shot
train_set["gpt-4o_zero_shot_difficulty"] = gpt_4o_zero_shot

In [None]:
# Reflect the scores within their observed range: max + min - x maps the largest score to the smallest and vice versa.
max_val = train_set["gpt-3.5_zero_shot_difficulty"].max()
min_val = train_set["gpt-3.5_zero_shot_difficulty"].min()

train_set["gpt-3.5_zero_shot_beta"] = max_val + min_val - train_set["gpt-3.5_zero_shot_difficulty"]

In [None]:
# Same range reflection (max + min - x) for the GPT-4-turbo zero-shot scores.
max_val = train_set["gpt-4-turbo_zero_shot_difficulty"].max()
min_val = train_set["gpt-4-turbo_zero_shot_difficulty"].min()

train_set["gpt-4-turbo_zero_shot_beta"] = max_val + min_val - train_set["gpt-4-turbo_zero_shot_difficulty"]

In [None]:
# Same range reflection (max + min - x) for the GPT-4o zero-shot scores.
max_val = train_set["gpt-4o_zero_shot_difficulty"].max()
min_val = train_set["gpt-4o_zero_shot_difficulty"].min()

train_set["gpt-4o_zero_shot_beta"] = max_val + min_val - train_set["gpt-4o_zero_shot_difficulty"]

In [None]:
# Persist the zero-shot columns by overwriting the CSV at TRAIN_PATH.
train_set.to_csv(TRAIN_PATH, index=False)

### CoT Prompting
Prompt is here: difficulty_evaluation_prompt_cot.txt

In [None]:
# Rebind the global `autograder_prompt` to the CoT template; any function reading the global now uses this one.
with open("difficulty_evaluation_prompt_cot.txt", "r") as f:
    autograder_prompt = f.read()

In [None]:
def generate_cot_output(question, response1, response2, model="gpt-3.5-turbo", temperature=0.0):
    """Fill the current ``autograder_prompt`` and send it as a single user message.

    Args:
        question, response1, response2: values substituted into the template.
        model: model name passed to the chat-completions call.
        temperature: sampling temperature passed to the call.

    Returns:
        The message content of the first returned choice.
    """
    filled_prompt = autograder_prompt.format(question=question, response1=response1, response2=response2)
    completion = gpt_client.chat.completions.create(
        messages=[{"role": "user", "content": filled_prompt}],
        model=model,
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [None]:
def generate_response(row, model="gpt-3.5-turbo"):
    """Run the CoT prompt on one row's question and its two choices."""
    question, first, second = row["questions"], row["choice1"], row["choice2"]
    return generate_cot_output(question, first, second, model=model)

In [None]:
# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(generate_response, row) for _, row in train_set.iterrows()]
    gpt_35_cot = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
partial_generate_response = partial(generate_response, model="gpt-4o")

# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(partial_generate_response, row) for _, row in train_set.iterrows()]
    gpt_4o_cot = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
partial_generate_response = partial(generate_response, model="gpt-4-turbo")

# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(partial_generate_response, row) for _, row in train_set.iterrows()]
    gpt_4_turbo_cot = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
def get_scores_and_reasonings(generated_array):
    """Parse autograder outputs into per-question score and reasoning lists.

    Each output is expected to contain the labels ``1.a``, ``1.b``, ...,
    ``7.a``, ``7.b`` exactly once each. The text after ``N.a`` is the reasoning
    for question N; the text after ``N.b`` is converted to a float score.

    Args:
        generated_array: iterable of raw model output strings.

    Returns:
        Tuple ``(scores_lists, reasonings_lists, errors)``. The first two are
        lists of 7 lists (one per question), and every inner list has exactly
        one entry per input string: the parsed value, or ``None`` when that
        string could not be parsed. ``errors`` holds the indices of the
        unparseable strings.
    """
    num_questions = 7
    # re.split with a capture group yields: leading text, then (label, text) pairs.
    expected_segments = 1 + 2 * (2 * num_questions)
    label_pattern = re.compile(r'(\d+\.[ab])')

    scores_lists = [[] for _ in range(num_questions)]
    reasonings_lists = [[] for _ in range(num_questions)]
    errors = []

    for index, data in enumerate(generated_array):
        try:
            segments = label_pattern.split(data)
            if len(segments) != expected_segments:
                raise ValueError(f"expected {expected_segments} segments, got {len(segments)}")

            # Parse into temporaries first, so a failure part-way through never
            # leaves half of an output's values in the shared result lists.
            item_scores = {}
            item_reasonings = {}
            # segments[0] is the text before the first label; skip it.
            for label, text in zip(segments[1::2], segments[2::2]):
                text = text.strip()
                question_number, part = label.split('.')
                question_index = int(question_number) - 1
                # Rejects e.g. "0.a", which would otherwise index the last question via -1.
                if not 0 <= question_index < num_questions:
                    raise ValueError(f"question number out of range: {label}")
                target = item_reasonings if part == 'a' else item_scores
                if not text or question_index in target:
                    raise ValueError(f"missing or duplicated section: {label}")
                target[question_index] = text if part == 'a' else float(text)
            # 14 distinct, in-range labels were seen, so both dicts are complete here.
        except Exception:
            errors.append(index)
            for scores, reasonings in zip(scores_lists, reasonings_lists):
                scores.append(None)
                reasonings.append(None)
        else:
            for question_index in range(num_questions):
                scores_lists[question_index].append(item_scores[question_index])
                reasonings_lists[question_index].append(item_reasonings[question_index])

    return scores_lists, reasonings_lists, errors


In [None]:
# Parse all GPT-3.5 CoT outputs in one call.
# NOTE(review): gpt_35_scores_lists / gpt_35_reasonings_lists are not used again in the visible part of the notebook.
gpt_35_scores_lists, gpt_35_reasonings_lists, errors = get_scores_and_reasonings(gpt_35_cot)

In [None]:
# Parse each GPT-3.5 CoT output separately and keep the first parsed entry for
# every one of the 7 questions.
scores = [[] for _ in range(7)]
reasonings = [[] for _ in range(7)]

for output in gpt_35_cot:
    score, all_reasonings, errors = get_scores_and_reasonings([output])
    # Validate the freshly parsed result, not the `scores` accumulator (which always has 7 lists).
    assert len(score) == 7
    assert len(all_reasonings) == 7
    assert all(len(r) > 0 for r in all_reasonings)

    for i in range(7):
        scores[i].append(score[i][0])
        reasonings[i].append(all_reasonings[i][0])

question1_scores, question2_scores, question3_scores, question4_scores, \
question5_scores, question6_scores, question7_scores = scores

question1_reasonings, question2_reasonings, question3_reasonings, question4_reasonings, \
question5_reasonings, question6_reasonings, question7_reasonings = reasonings

In [None]:
# One score column and one reasoning column per autograder question (numbered from 1).
for i in range(len(scores)):
    train_set[f"gpt-3.5_CoT_AG_question-{i+1}_difficulty_score"] = scores[i]
    train_set[f"gpt-3.5_CoT_AG_question-{i+1}_reasoning"] = reasonings[i]

In [None]:
# Row-wise mean / max / median over the per-question score columns.
# NOTE(review): range(1, 7) covers questions 1-6 only, so the question-7 score is excluded — confirm this is intended.
columns = [f"gpt-3.5_CoT_AG_question-{i}_difficulty_score" for i in range(1, 7)]
train_set['gpt-3.5_CoT_AG_mean_difficulty_score'] = train_set[columns].mean(axis=1)
train_set['gpt-3.5_CoT_AG_max_difficulty_score'] = train_set[columns].max(axis=1)
train_set['gpt-3.5_CoT_AG_median_difficulty_score'] = train_set[columns].median(axis=1)

In [None]:
# Parse all GPT-4-turbo CoT outputs in one call; `errors` lists the indices that failed to parse.
gpt_4_turbo_scores_lists, gpt_4_turbo_reasonings_lists, errors = get_scores_and_reasonings(gpt_4_turbo_cot)

In [None]:
# Parse each GPT-4-turbo CoT output separately and keep the first parsed entry
# for every one of the 7 questions.
scores = [[] for _ in range(7)]
reasonings = [[] for _ in range(7)]

for index, output in enumerate(gpt_4_turbo_cot):
    score, all_reasonings, errors = get_scores_and_reasonings([output])
    assert len(score) == 7
    assert len(all_reasonings) == 7
    assert all(len(r) > 0 for r in all_reasonings)
    try:
        for i in range(7):
            scores[i].append(score[i][0])
            reasonings[i].append(all_reasonings[i][0])
    except Exception:
        # Dump the offending output, then re-raise the real error instead of
        # masking it behind `assert False` (and without trapping KeyboardInterrupt).
        print(len(score))
        print(len(all_reasonings))
        print(all_reasonings)
        print(score)
        print(output)
        print(index)
        raise

# manually correct for any statements that error out

In [None]:
# Every per-question list must have one entry per row before it becomes a column.
expected_rows = len(train_set)
for i in range(len(scores)):
    assert len(scores[i]) == expected_rows
    assert len(reasonings[i]) == expected_rows
    for j in range(len(scores[i])):
        # A None reasoning marks an output that failed to parse.
        assert reasonings[i][j] is not None, reasonings[i][j]
    train_set[f"gpt-4-turbo_CoT_AG_question-{i+1}_difficulty_score"] = scores[i]
    train_set[f"gpt-4-turbo_CoT_AG_question-{i+1}_reasoning"] = reasonings[i]

In [None]:
# Row-wise mean / max / median over the per-question score columns.
# NOTE(review): range(1, 7) covers questions 1-6 only, so the question-7 score is excluded — confirm this is intended.
columns = [f"gpt-4-turbo_CoT_AG_question-{i}_difficulty_score" for i in range(1, 7)]
train_set['gpt-4-turbo_CoT_AG_mean_difficulty_score'] = train_set[columns].mean(axis=1)
train_set['gpt-4-turbo_CoT_AG_max_difficulty_score'] = train_set[columns].max(axis=1)
train_set['gpt-4-turbo_CoT_AG_median_difficulty_score'] = train_set[columns].median(axis=1)

In [None]:
# Parse all GPT-4o CoT outputs in one call; `errors` lists the indices that failed to parse.
gpt_4o_scores_lists, gpt_4o_reasonings_lists, errors = get_scores_and_reasonings(gpt_4o_cot)

In [None]:
# Parse each GPT-4o CoT output separately and keep the first parsed entry for
# every one of the 7 questions.
scores = [[] for _ in range(7)]
reasonings = [[] for _ in range(7)]

for index, output in enumerate(gpt_4o_cot):
    score, all_reasonings, errors = get_scores_and_reasonings([output])
    assert len(score) == 7
    assert len(all_reasonings) == 7
    assert all(len(r) > 0 for r in all_reasonings)
    try:
        for i in range(7):
            scores[i].append(score[i][0])
            reasonings[i].append(all_reasonings[i][0])
    except Exception:
        # Dump the offending output, then re-raise the real error instead of
        # masking it behind `assert False` (and without trapping KeyboardInterrupt).
        print(len(score))
        print(len(all_reasonings))
        print(score)
        print(output)
        print(index)
        raise

# manually correct for any statements that error out

In [None]:
# One score column and one reasoning column per autograder question (numbered from 1).
for i in range(len(scores)):
    train_set[f"gpt-4o_CoT_AG_question-{i+1}_difficulty_score"] = scores[i]
    train_set[f"gpt-4o_CoT_AG_question-{i+1}_reasoning"] = reasonings[i]

In [None]:
# Row-wise mean / max / median over the per-question score columns.
# NOTE(review): range(1, 7) covers questions 1-6 only, so the question-7 score is excluded — confirm this is intended.
columns = [f"gpt-4o_CoT_AG_question-{i}_difficulty_score" for i in range(1, 7)]
train_set['gpt-4o_CoT_AG_mean_difficulty_score'] = train_set[columns].mean(axis=1)
train_set['gpt-4o_CoT_AG_max_difficulty_score'] = train_set[columns].max(axis=1)
train_set['gpt-4o_CoT_AG_median_difficulty_score'] = train_set[columns].median(axis=1)

In [None]:
# Persist the CoT autograder columns by overwriting the CSV at TRAIN_PATH.
train_set.to_csv(TRAIN_PATH, index=False)

Llama 70B executions

In [None]:
def get_chat(question, response1, response2):
    """Send the filled ``autograder_prompt`` to Llama-3-70B-instruct as a system message.

    Args:
        question, response1, response2: values substituted into the template.

    Returns:
        The message content of the first returned choice.
    """
    filled_prompt = autograder_prompt.format(question=question, response1=response1, response2=response2)
    completion = llama_client.chat.completions.create(
        model="meta/llama3-70b-instruct",
        messages=[{"role": "system", "content": filled_prompt}],
        temperature=0.65,
        top_p=1,
        max_tokens=1024,
    )
    return completion.choices[0].message.content

def process_row(row):
    """Run the Llama autograder prompt on one row of the dataset.

    Passes the row's question and both candidate responses, matching
    get_chat's three-argument signature.

    Args:
        row: mapping with "questions", "choice1" and "choice2" entries.

    Returns:
        The model's reply text, as returned by get_chat.
    """
    return get_chat(row["questions"], row["choice1"], row["choice2"])

In [None]:
# Query the Llama model row by row, using whichever template `autograder_prompt` currently holds.
generated_statements = []
# NOTE(review): the list was reset on the line above, so this slice always starts at row 0; resuming from an offset only works if that reset is skipped on re-run.
for index, row in tqdm(train_set[len(generated_statements):].iterrows()):
    response = get_chat(row["questions"], row["choice1"], row["choice2"])
    generated_statements.append(response)

In [None]:
# Parse each Llama output separately and keep the first parsed entry for every
# one of the 7 questions.
scores = [[] for _ in range(7)]
reasonings = [[] for _ in range(7)]

for index, output in enumerate(generated_statements):
    score, all_reasonings, errors = get_scores_and_reasonings([output])
    assert len(score) == 7
    assert len(all_reasonings) == 7
    assert all(len(r) > 0 for r in all_reasonings)
    try:
        for i in range(7):
            scores[i].append(score[i][0])
            reasonings[i].append(all_reasonings[i][0])
    except Exception:
        # Dump the offending output, then re-raise the real error instead of
        # masking it behind `assert False` (and without trapping KeyboardInterrupt).
        print(len(score))
        print(len(all_reasonings))
        print(score)
        print(output)
        print(index)
        raise

In [None]:
# One score column and one reasoning column per autograder question (numbered from 1).
for i in range(len(scores)):
    train_set[f"llama_3-70B_CoT_AG_question-{i+1}_difficulty_score"] = scores[i]
    train_set[f"llama_3-70B_CoT_AG_question-{i+1}_reasoning"] = reasonings[i]

In [None]:
# Row-wise mean / max / median over the per-question score columns.
# NOTE(review): range(1, 7) covers questions 1-6 only, so the question-7 score is excluded — confirm this is intended.
columns = [f"llama_3-70B_CoT_AG_question-{i}_difficulty_score" for i in range(1, 7)]
train_set['llama_3-70B_CoT_AG_mean_difficulty_score'] = train_set[columns].mean(axis=1)
train_set['llama_3-70B_CoT_AG_max_difficulty_score'] = train_set[columns].max(axis=1)
train_set['llama_3-70B_CoT_AG_median_difficulty_score'] = train_set[columns].median(axis=1)

In [None]:
# NOTE(review): dropna() removes every row with a missing value in ANY column before overwriting TRAIN_PATH — confirm that discarding those rows from the saved training file is intended.
train_set.dropna().to_csv(TRAIN_PATH, index=False)

### Simpler CoT
repurposing the zero shot prompt, but asking the LLM to reason explicitly

The prompt is here: difficulty_evaluation_prompt_simpler_cot.txt

In [None]:
# Rebind the global `autograder_prompt` to the simpler-CoT template; any function reading the global now uses this one.
with open("difficulty_evaluation_prompt_simpler_cot.txt", "r") as f:
    autograder_prompt = f.read()

In [None]:
def generate_simple_cot_output(question, response1, response2, model="gpt-3.5-turbo", temperature=0.0):
    """Fill the current ``autograder_prompt`` and send it as a single user message.

    Args:
        question, response1, response2: values substituted into the template.
        model: model name passed to the chat-completions call.
        temperature: sampling temperature passed to the call.

    Returns:
        The message content of the first returned choice.
    """
    filled_prompt = autograder_prompt.format(question=question, response1=response1, response2=response2)
    completion = gpt_client.chat.completions.create(
        messages=[{"role": "user", "content": filled_prompt}],
        model=model,
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [None]:
def generate_response(row, model="gpt-3.5-turbo"):
    """Run the simpler-CoT prompt on one row's question and its two choices."""
    question, first, second = row["questions"], row["choice1"], row["choice2"]
    return generate_simple_cot_output(question, first, second, model=model)

In [None]:
# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(generate_response, row) for _, row in train_set.iterrows()]
    gpt_35_cot_simpler = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
partial_generate_response = partial(generate_response, model="gpt-4o")

# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(partial_generate_response, row) for _, row in train_set.iterrows()]
    gpt_4o_cot_simpler = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
partial_generate_response = partial(generate_response, model="gpt-4-turbo")

# Collect replies in submission (row) order; as_completed() would return them
# in completion order and misalign them with train_set's rows.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(partial_generate_response, row) for _, row in train_set.iterrows()]
    gpt_4_turbo_cot_simpler = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
def get_last_numeric_values(strings):
    """Extract the last number that appears in each string.

    The result always has exactly one entry per input, in the same order, so
    it can be assigned directly as a DataFrame column next to the texts it
    was parsed from.

    Args:
        strings: Iterable of model replies (or other text).

    Returns:
        A list of floats. An entry is NaN when the corresponding input holds
        no integer/decimal number or is not a string (e.g. ``None``).
    """
    numeric_values = []
    for string in strings:
        # Non-string entries cannot be searched; treat them as "no number".
        numbers = re.findall(r'[0-9]+(?:\.[0-9]+)?', string) if isinstance(string, str) else []
        # Keep a NaN placeholder rather than skipping, so positions stay aligned.
        numeric_values.append(float(numbers[-1]) if numbers else float("nan"))
    return numeric_values

In [None]:
# Parse the trailing number of every simpler-CoT reply into a score column, one column per model.
for score_column, raw_replies in (
    ("gpt-3.5_simpler_CoT_difficulty_scores", gpt_35_cot_simpler),
    ("gpt-4-turbo_simpler_CoT_difficulty_scores", gpt_4_turbo_cot_simpler),
    ("gpt-4o_simpler_CoT_difficulty_scores", gpt_4o_cot_simpler),
):
    train_set[score_column] = get_last_numeric_values(raw_replies)

In [None]:
# Keep the full text of each simpler-CoT reply alongside the parsed scores.
for reasoning_column, raw_replies in (
    ("gpt-3.5_simpler_CoT_reasoning", gpt_35_cot_simpler),
    ("gpt-4-turbo_simpler_CoT_reasoning", gpt_4_turbo_cot_simpler),
    ("gpt-4o_simpler_CoT_reasoning", gpt_4o_cot_simpler),
):
    train_set[reasoning_column] = raw_replies

In [None]:
# Persist the new simpler-CoT columns. This overwrites the CSV at TRAIN_PATH,
# which is the same file train_set was loaded from.
train_set.to_csv(TRAIN_PATH, index=False)

### Individual CoT AG questions
Asking the LLM the questions from the AG separately but with the same preamble

The prompt is here: difficulty_evaluation_prompt_individual_question.txt

Use one of the following questions:
1. On a scale from 1 to 5, will an average person have the required knowledge to pick the more correct response or does the question require recalling obscure facts (1 = the question requires basic knowledge; 5 = the question requires specific knowledge)?
2. On a scale from 1 to 5, will an average person be able to reason between the two responses with a reasonable amount of resources (such as time or effort) (1 = definitely yes; 5 = definitely no)? 
3. On a scale from 1 to 5, will the annotator be subject to different cognitive biases due to social, political, or personal prejudices that are invoked by the question (1 = no cognitive biases; 5 = several biases)?
4. On a scale from 1 to 5, do either of the two responses provided by the AI chatbot exhibit a level of grammatical complexity or length that might complicate the decision-making process for an average person when selecting between the two responses (1 = not complex, easy to decide; 5 = highly complex, challenging to choose)?
5. On a scale from 1 to 5, is the content in either of the responses provided by the AI chatbot particularly misleading? Ask yourself the following questions: Is there manipulative or confusing language? Are there incorrect or unnecessary facts or statistics? Is incorrect logical reasoning used? (1 = no, the responses are not misleading; 5 = yes, one or both of the responses will mislead an average person)
6. Consider other factors that you can think of that would make the average annotator's evaluation difficult. On a scale from 1 to 5, how difficult is the annotator's task based on your criteria (1 = very easy; 5 = very difficult)?
7. Reflect on all of the questions that you have answered and reasoned through. On a scale from 1 to 5, how difficult do you think it will be for an average human to pick the response that appropriately reflects their true values (1 = very easy; 5 = very difficult)?

In [None]:
# The seven reasoning questions, each asking for a 1-5 rating.
# These strings are passed as `reasoning_question` to the prompt template, so
# their exact wording is part of what the model sees.
# NOTE(review): question5 reads "On a scale form 1 to 5" - "form" looks like a
# typo for "from". It is left unchanged here because editing it changes the prompt.
question1 = "On a scale from 1 to 5, will an average person have the required knowledge to pick the more correct response or does the question require recalling obscure facts (1 = the question requires basic knowledge; 5 = the question requires specific knowledge)?"
question2 = "On a scale from 1 to 5, will an average person be able to reason between the two responses with a reasonable amount of resources (such as time or effort) (1 = definitely yes; 5 = definitely no)?" 
question3 = "On a scale from 1 to 5, will the annotator be subject to different cognitive biases due to social, political, or personal prejudices that are invoked by the question (1 = no cognitive biases; 5 = several biases)?"
question4 = "On a scale from 1 to 5, do either of the two responses provided by the AI chatbot exhibit a level of grammatical complexity or length that might complicate the decision-making process for an average person when selecting between the two responses (1 = not complex, easy to decide; 5 = highly complex, challenging to choose)?"
question5 = "On a scale form 1 to 5, is the content in either of the responses provided by the AI chatbot particularly misleading? Ask yourself the following questions: Is there manipulative or confusing language? Are there incorrect or unnecessary facts or statistics? Is incorrect logical reasoning used? (1 = no, the responses are not misleading; 5 = yes, one or both of the responses will mislead an average person)"
question6 = "Consider other factors that you can think of that would make the average annotator's evaluation difficult. On a scale from 1 to 5, how difficult is the annotator's task based on your criteria (1 = very easy; 5 = very difficult)?"
question7 = "Reflect on all of the questions that you have answered and reasoned through. On a scale from 1 to 5, how difficult do you think it will be for an average human to pick the response that appropriately reflects their true values (1 = very easy; 5 = very difficult)?"

In [None]:
# Ordered list of the reasoning questions; position k holds question k+1.
questions = [question1, question2, question3, question4, question5, question6, question7]

In [None]:
# Load the per-question prompt template used by generate_individual_cot_output.
# NOTE: this rebinds the global `autograder_prompt`, which
# generate_simple_cot_output also reads. After this cell runs, the simpler-CoT
# cells use this template until the simpler prompt is reloaded.
# The encoding is given explicitly so the template text does not depend on
# the platform's default locale encoding.
with open("difficulty_evaluation_prompt_individual_question.txt", "r", encoding="utf-8") as f:
    autograder_prompt = f.read()

In [None]:
def generate_individual_cot_output(question, response1, response2, reasoning_question, model="gpt-3.5-turbo", temperature=0.0):
    """Ask a chat model one reasoning question about a question/response pair.

    The module-level ``autograder_prompt`` template is formatted with the
    question, both responses and the reasoning question, then sent as a
    single user message through the module-level ``gpt_client``.

    Args:
        question: Text substituted for ``{question}`` in the template.
        response1: Text substituted for ``{response1}``.
        response2: Text substituted for ``{response2}``.
        reasoning_question: Text substituted for ``{reasoning_question}``.
        model: Model name passed to the chat completions endpoint.
        temperature: Sampling temperature passed to the endpoint.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    prompt = autograder_prompt.format(
        question=question,
        response1=response1,
        response2=response2,
        reasoning_question=reasoning_question,
    )
    user_message = {"role": "user", "content": prompt}
    completion = gpt_client.chat.completions.create(
        messages=[user_message],
        model=model,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
def generate_response(row, reasoning_question, model="gpt-3.5-turbo"):
    """Run the single-question autograder on one row and return the model's raw reply.

    NOTE: this replaces the earlier two-argument ``generate_response`` defined
    for the simpler-CoT prompt; from here on the name requires ``reasoning_question``.

    Args:
        row: Mapping-like record providing ``"questions"``, ``"choice1"`` and
            ``"choice2"``.
        reasoning_question: The rating question forwarded to
            ``generate_individual_cot_output``.
        model: Model name forwarded to ``generate_individual_cot_output``.
    """
    question, first_choice, second_choice = row["questions"], row["choice1"], row["choice2"]
    return generate_individual_cot_output(question, first_choice, second_choice, reasoning_question, model=model)

In [None]:
# gpt_35_cot_individual[k] holds the replies for questions[k], one per train_set row.
gpt_35_cot_individual = [[] for _ in range(len(questions))]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit every (question, row) request up front so they all run concurrently.
    futures = {}
    for question in questions:
        futures[question] = [
            executor.submit(generate_response, row, question, model="gpt-3.5-turbo")
            for _, row in train_set.iterrows()
        ]

    # Collect in submission order, not completion order: the replies are later
    # written back as train_set columns, so reply k must belong to row k.
    for i, question in enumerate(questions):
        gpt_35_cot_individual[i] = [
            future.result()
            for future in tqdm(futures[question], total=len(futures[question]), desc=f"Processing {question}")
        ]

In [None]:
# gpt_4_turbo_cot_individual[k] holds the replies for questions[k], one per train_set row.
gpt_4_turbo_cot_individual = [[] for _ in range(len(questions))]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit every (question, row) request up front so they all run concurrently.
    futures = {}
    for question in questions:
        futures[question] = [
            executor.submit(generate_response, row, question, model="gpt-4-turbo")
            for _, row in train_set.iterrows()
        ]

    # Collect in submission order, not completion order: the replies are later
    # written back as train_set columns, so reply k must belong to row k.
    for i, question in enumerate(questions):
        gpt_4_turbo_cot_individual[i] = [
            future.result()
            for future in tqdm(futures[question], total=len(futures[question]), desc=f"Processing {question}")
        ]

In [None]:
# gpt_4o_cot_individual[k] holds the replies for questions[k], one per train_set row.
gpt_4o_cot_individual = [[] for _ in range(len(questions))]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit every (question, row) request up front so they all run concurrently.
    futures = {}
    for question in questions:
        futures[question] = [
            executor.submit(generate_response, row, question, model="gpt-4o")
            for _, row in train_set.iterrows()
        ]

    # Collect in submission order, not completion order: the replies are later
    # written back as train_set columns, so reply k must belong to row k.
    for i, question in enumerate(questions):
        gpt_4o_cot_individual[i] = [
            future.result()
            for future in tqdm(futures[question], total=len(futures[question]), desc=f"Processing {question}")
        ]

In [None]:
# One list of parsed scores per reasoning question (GPT-3.5 replies).
gpt_35_individual_numeric = list(map(get_last_numeric_values, gpt_35_cot_individual))

In [None]:
# One list of parsed scores per reasoning question (GPT-4-turbo replies).
gpt_4_turbo_individual_numeric = list(map(get_last_numeric_values, gpt_4_turbo_cot_individual))

In [None]:
# One list of parsed scores per reasoning question (GPT-4o replies).
gpt_4o_individual_numeric = list(map(get_last_numeric_values, gpt_4o_cot_individual))

In [None]:
# Store, for each reasoning question (numbered from 1), the parsed score and the raw GPT-3.5 reply.
for i, parsed_scores in enumerate(gpt_35_individual_numeric):
    question_number = i + 1
    train_set[f"gpt_3.5_individual_CoT_q{question_number}_difficulty_score"] = parsed_scores
    train_set[f"gpt_3.5_individual_CoT_q{question_number}_reasoning"] = gpt_35_cot_individual[i]

In [None]:
# Store, for each reasoning question (numbered from 1), the parsed score and the raw GPT-4-turbo reply.
for i, parsed_scores in enumerate(gpt_4_turbo_individual_numeric):
    question_number = i + 1
    train_set[f"gpt_4_turbo_individual_CoT_q{question_number}_difficulty_score"] = parsed_scores
    train_set[f"gpt_4_turbo_individual_CoT_q{question_number}_reasoning"] = gpt_4_turbo_cot_individual[i]

In [None]:
# Store, for each reasoning question (numbered from 1), the parsed score and the raw GPT-4o reply.
for i, parsed_scores in enumerate(gpt_4o_individual_numeric):
    question_number = i + 1
    train_set[f"gpt_4o_individual_CoT_q{question_number}_difficulty_score"] = parsed_scores
    train_set[f"gpt_4o_individual_CoT_q{question_number}_reasoning"] = gpt_4o_cot_individual[i]

In [None]:
# Persist the per-question CoT columns. This overwrites the CSV at TRAIN_PATH,
# which is the same file train_set was loaded from.
train_set.to_csv(TRAIN_PATH, index=False)

### One shot prompting + CoT

### Pairwise comparisons

### Different relationships between beta and difficulty scores

In [None]:
import torch

def get_sigmoid_thresholding_beta(row, difficulty_key, threshold, factor):
    """Map a row's score to (0, 1) with a shifted, scaled sigmoid.

    Computes ``sigmoid((row[difficulty_key] - threshold) * factor)``.

    Args:
        row: Mapping-like record holding the score under ``difficulty_key``.
        difficulty_key: Key/column name of the score to transform.
        threshold: Score at which the output equals 0.5.
        factor: Multiplier applied before the sigmoid; larger values give a
            sharper transition around ``threshold``.

    Returns:
        The sigmoid value as a Python float.
    """
    shifted = torch.tensor(row[difficulty_key] - threshold)
    scaled = shifted * factor
    return torch.sigmoid(scaled).item()

In [None]:
min_value = train_set['gpt-3.5_CoT_AG_mean_difficulty_score'].min()
max_value = train_set['gpt-3.5_CoT_AG_mean_difficulty_score'].max()

# Min-max normalise the mean difficulty to [0, 1], then flip it so the largest
# difficulty maps to 0 and the smallest to 1.
# (The original line was missing the opening parenthesis and did not parse.)
train_set['gpt-3.5_CoT_AG_flipped_mean_difficulty_score'] = 1 - (train_set['gpt-3.5_CoT_AG_mean_difficulty_score'] - min_value) / (max_value - min_value)

In [None]:
import itertools
factors = [3, 10, 30]
thresholds = [0.5, 0.7]
combinations = list(itertools.product(factors, thresholds))

# Add one sigmoid-transformed column per (factor, threshold) pair, computed
# row by row from the flipped mean difficulty score.
for factor, threshold in combinations:
    sigmoid_beta = partial(
        get_sigmoid_thresholding_beta,
        difficulty_key="gpt-3.5_CoT_AG_flipped_mean_difficulty_score",
        threshold=threshold,
        factor=factor,
    )
    train_set[f"gpt-3.5_CoT_AG_mean_sigmoid-{threshold}_{factor}"] = train_set.apply(sigmoid_beta, axis=1)


In [None]:
# Names of all sigmoid-transformed columns currently in train_set.
cols = list(filter(lambda name: "gpt-3.5_CoT_AG_mean_sigmoid" in name, train_set.columns))

In [None]:
# Histogram of the sigmoid column built with threshold 0.7 and factor 30.
plt.hist(train_set["gpt-3.5_CoT_AG_mean_sigmoid-0.7_30"])

In [None]:
# Persist the flipped and sigmoid columns. This overwrites the CSV at
# TRAIN_PATH, which is the same file train_set was loaded from.
train_set.to_csv(TRAIN_PATH, index=False)

## Did the annotators make decisions based on difficulty?
Running logistic regression between a binary variable representing whether or not the correct answer was chosen and the difficulty metric

Create dataframes for comparison below:
- only correct and incorrect answers
- correct and incorrect answers when they are of the same length

In [None]:
def _select_by_tag(frame, predicate):
    """Keep the rows of ``frame`` whose stringified ``tag_IDs`` value satisfies ``predicate``."""
    return frame[frame["tag_IDs"].apply(lambda tag: predicate(str(tag)))]


# NOTE(review): judging by the variable names, tag_IDs appears to encode both
# responses as <correctness><length><correctness><length>, with "11" meaning
# correct+concise and "12" correct+detailed - confirm against the data spec.
correct_incorrect_answer_pairs = _select_by_tag(train_set, lambda tag: tag[0] != tag[2])
correct_incorrect_same_length_pairs = _select_by_tag(correct_incorrect_answer_pairs, lambda tag: tag[1] == tag[3])
correct_incorrect_diff_length_pairs = _select_by_tag(correct_incorrect_answer_pairs, lambda tag: tag[1] != tag[3])
correct_concise_incorrect_detailed = _select_by_tag(correct_incorrect_diff_length_pairs, lambda tag: "11" in (tag[0:2], tag[2:4]))
correct_detailed_incorrect_concise = _select_by_tag(correct_incorrect_diff_length_pairs, lambda tag: "12" in (tag[0:2], tag[2:4]))

In [None]:
# Display names and the matching subsets, in the same order.
labels = [
    "All Correct-Incorrect Pairs",
    "Correct-Incorrect Pairs of Same Length",
    "Correct-Incorrect Pairs of Diff. Length",
    "Correct Concise, Incorrect Detailed",
    "Correct Detailed, Incorrect Concise",
]
datasets = [
    correct_incorrect_answer_pairs,
    correct_incorrect_same_length_pairs,
    correct_incorrect_diff_length_pairs,
    correct_concise_incorrect_detailed,
    correct_detailed_incorrect_concise,
]

# Every "difficulty" column except those whose name mentions ground truth or a judge.
excluded_markers = ("ground_truth", "judge")
cols = [
    name
    for name in train_set.columns
    if "difficulty" in name and not any(marker in name for marker in excluded_markers)
]

# One (initially empty) list of losses per difficulty column.
losses = [[] for _ in cols]
data_dict = {name: bucket for name, bucket in zip(cols, losses)}

In [None]:
def generate_logistic_reg_score(dataset, x_col, y_col="correct_chosen", plot_title=None):
    """Score how well one column predicts ``y_col`` with a one-feature logistic regression.

    The model is fitted and evaluated on the same data.

    Args:
        dataset: DataFrame holding both ``x_col`` and ``y_col``.
        x_col: Name of the single feature column.
        y_col: Name of the binary target column.
        plot_title: If truthy, also show overlaid histograms of the feature
            for target 0 ("incorrect") and target 1 ("correct") under this title.

    Returns:
        The log loss of the fitted model when its coefficient is negative;
        otherwise ``math.log(2)``, the log loss of always predicting 0.5.
    """
    # Alternative feature kept for reference: binarise the score at >= 4.
    # X = (dataset[x_col].values >= 4).astype(float)[:, None]
    features = dataset[x_col].values[:, None]
    target = dataset[y_col]

    classifier = LogisticRegression()
    classifier.fit(features, target)
    predicted = classifier.predict_proba(features)

    if classifier.coef_ < 0:
        loss = log_loss(target, predicted)
    else:
        loss = math.log(2)

    if plot_title:
        for target_value, legend_label in ((0, "incorrect"), (1, "correct")):
            plt.hist(features[target == target_value], alpha=0.5, label=legend_label)
        plt.title(plot_title)
        plt.legend()
        plt.show()

    return loss

In [None]:
# Fill data_dict with one loss per (difficulty column, dataset) pair.
for col, lst in data_dict.items():
    # "Labels" is added to data_dict by a later cell and is not a score column;
    # skip it so this cell can be re-run safely.
    if col == "Labels":
        continue
    # Start from an empty list so re-running the cell does not append duplicates.
    lst.clear()
    for dataset in datasets:
        try:
            lst.append(generate_logistic_reg_score(dataset, col))
        except Exception as exc:
            # Report the failure but keep a NaN placeholder: every list must
            # have one entry per dataset for pd.DataFrame(data_dict) to work.
            print(f"{col}: {type(exc).__name__}: {exc}")
            lst.append(float("nan"))

In [None]:
# Attach the dataset display names as an extra "Labels" entry (used as the table header below).
data_dict.update(Labels=labels)

In [None]:
# Build a table with one row per difficulty column and one column per dataset label.
df = pd.DataFrame(data_dict)
df_transposed = df.T
# After transposing, the last row is the "Labels" entry: promote it to the header.
new_header = df_transposed.iloc[-1]
df_transposed = df_transposed.iloc[:-1]
df_transposed.columns = new_header

In [None]:
def highlight_min(s):
    """Return one CSS string per element of ``s``, highlighting the minimum.

    Elements equal to ``s.min()`` get ``'background-color: yellow'``; all
    others get an empty string.
    """
    smallest = s.min()
    return ['background-color: yellow' if value == smallest else '' for value in s]

# Highlight the smallest loss within each column (axis=0 applies the function column by column).
styled_df = df_transposed.style.apply(highlight_min, axis=0)

styled_df

In [None]:
from statsmodels.stats.proportion import proportion_confint

# Compare the empirical rate of choosing the correct answer, binned by score,
# with the probability predicted by a logistic regression fitted on the same column.
correct_bins = []  # fraction of rows with correct_chosen == 1, per bin
counts = []        # number of rows per bin
confints = []      # (lower, upper) interval from proportion_confint, per bin
col = "gpt-3.5_CoT_AG_mean_sigmoid-0.5_30"
dataset = correct_incorrect_answer_pairs
# dataset = correct_incorrect_same_length_pairs

# scores = np.arange()
# Four bin centres: 0, 1/3, 2/3 and (approximately) 1.
scores = np.arange(0, 1.01, 0.333333333333333)

for score in scores:
    # A row belongs to a bin when its score is within 0.15 of the bin centre.
    mask = np.abs(dataset[col] - score) < 0.15
    count = mask.sum()
    counts.append(count)
    correct = int(dataset["correct_chosen"].values[mask].sum())
    # alpha=0.1 is the significance level, i.e. a 90% interval.
    confint = proportion_confint(count=correct, nobs=count, alpha=0.1)
    # NOTE(review): count is 0 when no row falls in a bin, which makes this
    # ratio undefined - confirm every bin is populated for the chosen column.
    correct_bins.append(correct / count)
    confints.append(confint)
    
# X = (correct_incorrect_answer_pairs[col].values >= 4).astype(float)[:, None]
X = dataset[col].values[:, None]
y = dataset["correct_chosen"]

model = LogisticRegression()
model.fit(X, y)

# Evaluate the fitted curve on 100 evenly spaced scores across the bin range.
s = np.linspace(scores.min(), scores.max(), 100)
probabilities = model.predict_proba(s[:, None])

plt.bar(
    scores,
    correct_bins,
    # Convert absolute interval bounds into distances from the bar height,
    # shaped (2, n_bins) as matplotlib's yerr expects.
    yerr=np.abs(np.array(confints) - np.array(correct_bins)[:, None]).transpose(),
    width = (scores[1] - scores[0]) / 2
)
# Dashed black line: predicted probability of the second class in model.classes_.
plt.plot(s, probabilities[:, 1], c="k", ls="--")
# Tick labels show the bin centre and, in parentheses, the number of rows in the bin.
plt.xticks(scores, labels=[f"{score:.1f} ({count})" for score, count in zip(scores, counts)])
plt.show()