In [58]:
import pandas as pd
import json
import random
from openai import OpenAI
from time import sleep
from tqdm.notebook import tqdm
import numpy as np
import re
import pickle as pkl
import math
import os

import concurrent.futures
from functools import partial

import mwparserfromhell
import pandas as pd
import gensim.downloader as api
from gensim.corpora import WikiCorpus
from multiprocessing import Pool

import pickle

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
# Lift pandas' display limits so frames are rendered without truncating
# columns, rows, or long cell contents.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 100000
pd.options.display.max_colwidth = None

In [3]:
TRAIN_PATH = "train_llama3-70b_regenerated_detailed.csv"
train_df = pd.read_csv(TRAIN_PATH)
# Keep only the rows that have a scratch space. The explicit .copy() makes the
# filtered frame independent of the frame read from disk, so the new columns
# assigned to it further down do not raise SettingWithCopyWarning.
train_df = train_df[train_df["scratch_space"].notna()].copy()

In [4]:
# Imported here because this cell is the only place that needs it.
from getpass import getpass

# Never read the key with input(): the typed value is echoed into the cell
# output and saved with the notebook. Use the key already exported in the
# environment when there is one; otherwise prompt for it without echo.
gpt_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")

gpt_client = OpenAI(api_key=gpt_api_key)
os.environ["OPENAI_API_KEY"] = gpt_api_key

Enter your OpenAI API key:  [REDACTED - a live key was saved in this output; revoke it and clear outputs before sharing]


In [5]:
# Read the autograder prompt template; it is filled in with str.format()
# inside generate_cot_output.
with open("difficulty_evaluation_prompt_cot_scratch_space.txt") as prompt_file:
    autograder_prompt = prompt_file.read()

In [6]:
def generate_cot_output(question, response1, response2, scratch_space, model="gpt-3.5-turbo", temperature=0.0):
    """Fill the autograder prompt template and return the model's reply text.

    Args:
        question, response1, response2, scratch_space: values substituted into
            the matching ``{...}`` placeholders of ``autograder_prompt``.
        model: model name passed to the chat completions endpoint.
        temperature: sampling temperature passed to the endpoint.

    Returns:
        The message content of the first choice in the completion.
    """
    prompt = autograder_prompt.format(
        question=question,
        response1=response1,
        response2=response2,
        scratch_space=scratch_space,
    )
    # The whole prompt is sent as a single user message.
    completion = gpt_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
def generate_response(row, model="gpt-3.5-turbo"):
    """Run the autograder on one row and return its raw text output.

    ``row`` must support lookup by the keys "questions", "choice1", "choice2"
    and "scratch_space"; those four values are forwarded, in that order, to
    ``generate_cot_output`` together with ``model``.
    """
    return generate_cot_output(
        row["questions"],
        row["choice1"],
        row["choice2"],
        row["scratch_space"],
        model=model,
    )

In [8]:
# Query GPT-3.5 for every row of train_df in parallel.
#
# The outputs are later attached to train_df by position, so they must come
# back in row order. Executor.map yields results in the order of its inputs
# (as_completed yields them in completion order, which scrambles the
# row/output correspondence).
gpt_35_rows = [row for _, row in train_df.iterrows()]

with concurrent.futures.ThreadPoolExecutor() as executor:
    gpt_35_cot = list(
        tqdm(executor.map(generate_response, gpt_35_rows), total=len(gpt_35_rows))
    )

  0%|          | 0/65 [00:00<?, ?it/s]

In [94]:
# Same generation pass as for GPT-3.5, with the model switched to gpt-4-turbo.
partial_generate_response = partial(generate_response, model="gpt-4-turbo")

# Executor.map yields results in input order, so gpt_4_turbo_cot[i] belongs to
# the i-th row of train_df (as_completed would return them in completion order).
gpt_4_turbo_rows = [row for _, row in train_df.iterrows()]

with concurrent.futures.ThreadPoolExecutor() as executor:
    gpt_4_turbo_cot = list(
        tqdm(
            executor.map(partial_generate_response, gpt_4_turbo_rows),
            total=len(gpt_4_turbo_rows),
        )
    )

  0%|          | 0/65 [00:00<?, ?it/s]

In [101]:
def _parse_autograder_output(data, num_questions):
    """Parse one autograder output into (scores, reasonings), one entry per question.

    Raises on any output that does not contain exactly one non-empty "<n>.a"
    reasoning and one numeric "<n>.b" score for every question 1..num_questions.
    """
    segments = re.split(r'(\d+\.[ab])', data)
    # Splitting on a captured pattern yields: preamble, key, text, key, text, ...
    # so 2 parts per question give 2 * (2 * num_questions) + 1 segments.
    expected_segments = 4 * num_questions + 1
    if len(segments) != expected_segments:
        raise ValueError(
            f"expected {expected_segments} segments, found {len(segments)}"
        )

    scores = [None] * num_questions
    reasonings = [None] * num_questions
    for key, text in zip(segments[1::2], segments[2::2]):
        question_number, part = key.split('.')
        question_index = int(question_number) - 1
        # Reject out-of-range numbers explicitly; "0.a" would otherwise index -1
        # and silently land in the last question.
        if not 0 <= question_index < num_questions:
            raise ValueError(f"unexpected question label {key!r}")

        target = reasonings if part == 'a' else scores
        if target[question_index] is not None:
            raise ValueError(f"duplicate label {key!r}")

        text = text.strip()
        if not text:
            raise ValueError(f"empty content for label {key!r}")
        target[question_index] = text if part == 'a' else float(text)

    return scores, reasonings


def get_scores_and_reasonings(generated_array):
    """Split autograder outputs into per-question score and reasoning lists.

    Each output is expected to hold 14 labelled parts, "1.a", "1.b", ... "7.b":
    the text after "<n>.a" is the reasoning for question n and the text after
    "<n>.b" is its score, parsed as a float.

    Every output is parsed in full before anything is stored, so an output that
    fails part-way contributes exactly one ``None`` to every list and all lists
    stay aligned with ``generated_array``.

    Args:
        generated_array: iterable of raw output strings.

    Returns:
        A tuple ``(scores_lists, reasonings_lists, errors)``. The first two are
        lists of 7 lists (one per question), each with one entry per input;
        ``errors`` holds the indices of the inputs that could not be parsed.
    """
    num_questions = 7
    scores_lists = [[] for _ in range(num_questions)]
    reasonings_lists = [[] for _ in range(num_questions)]
    errors = []

    for index, data in enumerate(generated_array):
        try:
            item_scores, item_reasonings = _parse_autograder_output(data, num_questions)
        except Exception as e:
            errors.append(index)
            print(e)
            item_scores = [None] * num_questions
            item_reasonings = [None] * num_questions

        # Commit the whole row at once so a failure never leaves partial entries.
        for question_index in range(num_questions):
            scores_lists[question_index].append(item_scores[question_index])
            reasonings_lists[question_index].append(item_reasonings[question_index])

    return scores_lists, reasonings_lists, errors

In [102]:
# Parse all GPT-3.5 outputs in one call: per-question scores, per-question
# reasonings, and the indices of the outputs that failed to parse.
(
    gpt_35_scores_lists,
    gpt_35_reasonings_lists,
    errors,
) = get_scores_and_reasonings(gpt_35_cot)

In [103]:
# Parse the outputs one at a time so each output contributes exactly one entry
# to every question's list.
scores = [[] for _ in range(7)]
reasonings = [[] for _ in range(7)]
errors = []  # indices into gpt_35_cot of the outputs that failed to parse

for output_index, output in enumerate(gpt_35_cot):
    score, all_reasonings, item_errors = get_scores_and_reasonings([output])
    # Check the per-output result, not the accumulators (which always have 7 slots).
    assert len(score) == 7
    assert len(all_reasonings) == 7

    if item_errors:
        # A failed output is recorded as None for every question, never as a
        # mix of real values and None.
        errors.append(output_index)
        output_scores = [None] * 7
        output_reasonings = [None] * 7
    else:
        assert all(len(s) > 0 for s in score)
        assert all(len(r) > 0 for r in all_reasonings)
        output_scores = [per_question[0] for per_question in score]
        output_reasonings = [per_question[0] for per_question in all_reasonings]

    for i in range(7):
        scores[i].append(output_scores[i])
        reasonings[i].append(output_reasonings[i])

question1_scores, question2_scores, question3_scores, question4_scores, \
question5_scores, question6_scores, question7_scores = scores

question1_reasonings, question2_reasonings, question3_reasonings, question4_reasonings, \
question5_reasonings, question6_reasonings, question7_reasonings = reasonings

In [104]:
# Add one score column and one reasoning column per question to train_df.
# Question numbers in the column names are 1-based.
for question_number, (question_scores, question_reasonings) in enumerate(zip(scores, reasonings), start=1):
    train_df[f"gpt-3.5_CoT_scratch_space_AG_question-{question_number}_difficulty_score"] = question_scores
    train_df[f"gpt-3.5_CoT_scratch_space_AG_question-{question_number}_reasoning"] = question_reasonings

In [None]:
# Row-wise aggregates over the per-question difficulty scores.
# The source columns are the ones written to train_df by the cell above, so the
# names must match that pattern exactly ("..._CoT_scratch_space_AG_question-N_...").
# NOTE(review): only questions 1-6 are aggregated although 7 questions are
# parsed - confirm that leaving out question 7 is intentional.
columns = [f"gpt-3.5_CoT_scratch_space_AG_question-{i}_difficulty_score" for i in range(1, 7)]
train_df['gpt-3.5_CoT_AG_scratch_space_mean_difficulty_score'] = train_df[columns].mean(axis=1)
train_df['gpt-3.5_CoT_AG_scratch_space_max_difficulty_score'] = train_df[columns].max(axis=1)
train_df['gpt-3.5_CoT_AG_scratch_space_median_difficulty_score'] = train_df[columns].median(axis=1)

In [47]:
# Save the augmented frame next to the input, under a "scratch_space_" prefix.
train_df.to_csv(f"scratch_space_{TRAIN_PATH}", index=False)

Logistic regression: how well each difficulty score predicts whether the correct answer was chosen

In [86]:
# Build nested subsets of train_df from the stringified tag_IDs value.
# Judging by the subset names, characters 0 and 2 presumably encode the
# correctness of the two responses and characters 1 and 3 their length
# ("11" = correct + concise, "12" = correct + detailed) - TODO confirm.

# Pairs whose characters 0 and 2 differ.
differs_in_correctness = train_df["tag_IDs"].apply(lambda tag: str(tag)[0] != str(tag)[2])
correct_incorrect_answer_pairs = train_df[differs_in_correctness]

# Within those, split on whether characters 1 and 3 agree.
has_same_length = correct_incorrect_answer_pairs["tag_IDs"].apply(lambda tag: str(tag)[1] == str(tag)[3])
correct_incorrect_same_length_pairs = correct_incorrect_answer_pairs[has_same_length]

has_diff_length = correct_incorrect_answer_pairs["tag_IDs"].apply(lambda tag: str(tag)[1] != str(tag)[3])
correct_incorrect_diff_length_pairs = correct_incorrect_answer_pairs[has_diff_length]

# Within the different-length pairs, select by which two-character code appears
# for either response.
has_code_11 = correct_incorrect_diff_length_pairs["tag_IDs"].apply(
    lambda tag: str(tag)[0:2] == "11" or str(tag)[2:4] == "11"
)
correct_concise_incorrect_detailed = correct_incorrect_diff_length_pairs[has_code_11]

has_code_12 = correct_incorrect_diff_length_pairs["tag_IDs"].apply(
    lambda tag: str(tag)[0:2] == "12" or str(tag)[2:4] == "12"
)
correct_detailed_incorrect_concise = correct_incorrect_diff_length_pairs[has_code_12]

In [87]:
# Subsets to evaluate, each paired with its display label (same order in both lists).
labels = [
    "All Correct-Incorrect Pairs",
    "Correct-Incorrect Pairs of Same Length",
    "Correct-Incorrect Pairs of Diff. Length",
    "Correct Concise, Incorrect Detailed",
]
datasets = [
    correct_incorrect_answer_pairs,
    correct_incorrect_same_length_pairs,
    correct_incorrect_diff_length_pairs,
    correct_concise_incorrect_detailed,
]

# Candidate predictors: every column whose name mentions "difficulty", except
# those whose name also mentions "ground_truth" or "judge".
cols = [
    col
    for col in train_df.columns
    if "difficulty" in col and not ("ground_truth" in col or "judge" in col)
]

# One (initially empty) list of losses per predictor column, keyed by column name.
losses = [[] for _ in cols]
data_dict = dict(zip(cols, losses))

In [88]:
def generate_logistic_reg_score(dataset, x_col, y_col="correct_chosen", plot_title=None):
    """Fit a one-feature logistic regression of ``y_col`` on ``x_col`` and return its log loss.

    The model is fit and scored on the same rows (in-sample). The fitted loss
    is returned only when the fitted coefficient is negative; otherwise the
    function returns ``log(2)``.

    Args:
        dataset: DataFrame containing ``x_col`` and ``y_col``.
        x_col: name of the single predictor column.
        y_col: name of the target column; must be binary.
        plot_title: when truthy, also show overlaid histograms of the predictor
            for the rows where the target equals 0 and where it equals 1.

    Returns:
        The log loss as a float, or ``math.log(2)`` when the coefficient is
        not negative.

    Raises:
        ValueError: if the fitted model does not have exactly one coefficient
            (i.e. the target is not binary), in addition to anything raised by
            scikit-learn while fitting.
    """
    X = dataset[x_col].values[:, None]  # sklearn expects a 2-D (n_samples, 1) array
    y = dataset[y_col]

    model = LogisticRegression()
    model.fit(X, y)

    # For a binary target with one feature, coef_ has shape (1, 1). Pull the
    # slope out explicitly instead of truth-testing the array.
    if model.coef_.shape != (1, 1):
        raise ValueError(
            f"expected a single coefficient, got coef_ with shape {model.coef_.shape}"
        )
    slope = model.coef_[0, 0]

    if slope < 0:
        loss = log_loss(y, model.predict_proba(X))
    else:
        loss = math.log(2)

    if plot_title:
        y_values = np.asarray(y)
        plt.hist(X[y_values == 0, 0], alpha=0.5, label="incorrect")
        plt.hist(X[y_values == 1, 0], alpha=0.5, label="correct")
        plt.title(plot_title)
        plt.xlabel(x_col)
        plt.legend()
        plt.show()

    return loss

In [89]:
# Score every predictor column on every subset.
# Iterate over `cols` rather than data_dict so a re-run after the "Labels" key
# has been added does not treat that key as a predictor.
for col in cols:
    column_losses = []
    for label, dataset in zip(labels, datasets):
        try:
            loss = generate_logistic_reg_score(dataset, col)
        except Exception as e:
            print(f"{col} / {label}: {e}")
            # Keep one entry per subset so every column stays aligned with
            # `labels`; a missing entry would make the DataFrame build fail.
            loss = float("nan")
        column_losses.append(loss)
    # Replace the contents in place so re-running the cell does not accumulate
    # duplicate entries in the existing list objects.
    data_dict[col][:] = column_losses

In [90]:
# Store the subset labels under their own key; they become the table header
# once the frame is transposed.
data_dict.update({"Labels": labels})

In [91]:
# Turn the results so each predictor column becomes a row.
df = pd.DataFrame(data_dict)
df_transposed = df.T

# "Labels" was the last key added to data_dict, so it is the last row after
# transposing: lift it out as the header and keep the remaining rows.
new_header = df_transposed.iloc[-1]
df_transposed = df_transposed.iloc[:-1]
df_transposed.columns = new_header

In [92]:
def highlight_min(s):
    """Return one CSS string per element of ``s``, highlighting the minimum.

    Elements equal to ``s.min()`` get a yellow background; all others get an
    empty style string. Ties are all highlighted.
    """
    highlighted, plain = 'background-color: yellow', ''
    return [highlighted if is_smallest else plain for is_smallest in s.eq(s.min())]

# Highlight the smallest loss within each column (axis=0 applies the function
# column by column, which is also Styler.apply's default).
styled_df = df_transposed.style.apply(highlight_min, axis=0)

styled_df

Labels,All Correct-Incorrect Pairs,Correct-Incorrect Pairs of Same Length,Correct-Incorrect Pairs of Diff. Length,"Correct Concise, Incorrect Detailed"
gpt-3.5_zero_shot_difficulty,0.693147,0.674963,0.693147,0.693147
gpt-4-turbo_zero_shot_difficulty,0.686877,0.637882,0.693147,0.693147
gpt-4o_zero_shot_difficulty,0.693147,0.693147,0.693147,0.693147
gpt-3.5_CoT_AG_question-1_difficulty_score,0.655753,0.609134,0.674784,0.670301
gpt-4o_CoT_AG_question-1_difficulty_score,0.693147,0.65435,0.693147,0.693147
gpt-4o_CoT_AG_question-2_difficulty_score,0.693147,0.676243,0.693147,0.693147
gpt-4o_CoT_AG_question-3_difficulty_score,0.693147,0.673709,0.693147,0.693147
gpt-4o_CoT_AG_question-4_difficulty_score,0.693147,0.690771,0.693147,0.693147
gpt-4o_CoT_AG_question-5_difficulty_score,0.693147,0.693147,0.693147,0.693147
gpt-4o_CoT_AG_question-6_difficulty_score,0.693147,0.569273,0.693147,0.693147
