In [71]:
import numpy as np
import pandas as pd
import json
import sklearn as sk
import matplotlib as plt
import warnings
warnings.filterwarnings('ignore')


In [72]:
def confusion_matrix(data, prompt):
    data = data[data["PromptType"]==prompt]
    return np.array([[len(data[data["model_bruteforce"]=="1"][data["human_bruteforce"]=="1"]), len(data[data["model_bruteforce"]=="1"][data["human_bruteforce"]=="0"])], [len(data[data["model_bruteforce"]=="0"][data["human_bruteforce"]=="1"]), len(data[data["model_bruteforce"]=="0"][data["human_bruteforce"]=="0"])]])

def correctness(data, prompt):
    data = data[data["PromptType"]==prompt]
    return len(data[data["correctness"]=="1"]) / len(data)

In [75]:
data = {}
summary = {}
models = ['o3', 'GeminiFlash', 'Qwen1', 'Qwen14', 'Qwen70', 'DSChat', 'DSReason']
for model in models:
    file_path = f"response_evaluation/Logic/SolutionSummary-{model}/resultsEvaluations_evaluatedbyo3-2025-04-16.jsonl"
    with open(file_path, 'r') as file:
        data[model] = [json.loads(line) for line in file]
    
    with open(f"response_evaluation/Logic/LogicAll-{model}{'Batch' if model == 'o3' else ''}/resultsEvaluations_evaluatedbyo3-2025-04-16.jsonl", 'r') as file:
        correctnessData = [json.loads(line) for line in file]
    
    prompts = ["basicprompt", "mathPrompt", "hintPrompt", "hint_prompt", "combinedhintPrompt"]
    summaries = pd.DataFrame(data[model])
    correctness = pd.DataFrame(correctnessData)
    # print(correctness.head())
    data[model] = correctness
    summary = {}
    # print(model, prompts)
    
    print("Adequate:", len(summaries[summaries["Summary"]=="1"]))
    print("Inadequate:", len(summaries[summaries["Summary"]=="0"]))
    
    for index, row in summaries.iterrows():
        summary[row["ID"]] = int(row["Summary"])

    for prompt in prompts:
        bscores = [[0, 0], [0, 0]]
        cscores = [[0, 0], [0, 0]]
        for index, row in correctness.iterrows():
            if row["PromptType"] == prompt:
                if (type(row["Response"]) != str or row["Response"] == None or row["Response"] == "NaN" or row["Response"] == "None" or row["Response"] == "" or row["model_bruteforce"] == "NULL" or row["Response"] is str and row["Response"].isspace()):
                        continue
                try:
                    cscores[summary[int(row["ID"])]][int(row["correctness"])] += 1
                    bscores[summary[int(row["ID"])]][int(row["model_bruteforce"])] += 1
                except:
                    # print("failed", row["ID"], row["correctness"], row["model_bruteforce"])
                    pass
        # print(cscores, bscores)
        try:
            print("Model", model, "Prompt:", prompt, 
            f'{round(100*cscores[1][1]/(cscores[1][0]+cscores[1][1]), 1)}/{round(100*cscores[0][1]/(cscores[0][0]+cscores[0][1]), 1)}',
            # f'{round(100*bscores[1][1]/(bscores[1][0]+bscores[1][1]), 1)}/{round(100*bscores[0][1]/(bscores[0][0]+bscores[0][1]), 1)}',
              )
        except:
            pass
    # for j in range(4)
    #     print(round(100*scores[0][j][1]/(scores[0][j][0]), 1), round(100*scores[1][j][1]/(scores[1][j][0]), 1)) 

Adequate: 190
Inadequate: 55
Model o3 Prompt: basicprompt 83.5/90.2
Model o3 Prompt: mathPrompt 84.8/85.4
Model o3 Prompt: hint_prompt 88.1/77.5
Model o3 Prompt: combinedhintPrompt 88.2/80.0
Adequate: 158
Inadequate: 65
Model GeminiFlash Prompt: basicprompt 59.6/48.0
Model GeminiFlash Prompt: mathPrompt 59.1/59.2
Model GeminiFlash Prompt: hint_prompt 60.5/60.8
Model GeminiFlash Prompt: combinedhintPrompt 64.3/70.5
Adequate: 11
Inadequate: 239
Model Qwen1 Prompt: basicprompt 36.4/2.5
Model Qwen1 Prompt: mathPrompt 9.1/3.8
Model Qwen1 Prompt: hintPrompt 36.4/5.4
Model Qwen1 Prompt: combinedhintPrompt 9.1/3.4
Adequate: 100
Inadequate: 150
Model Qwen14 Prompt: basicprompt 38.0/11.3
Model Qwen14 Prompt: mathPrompt 41.0/12.0
Model Qwen14 Prompt: hintPrompt 45.0/15.3
Model Qwen14 Prompt: combinedhintPrompt 38.0/18.0
Adequate: 111
Inadequate: 139
Model Qwen70 Prompt: basicprompt 33.3/17.3
Model Qwen70 Prompt: mathPrompt 36.0/15.1
Model Qwen70 Prompt: hintPrompt 39.6/15.1
Model Qwen70 Prompt: c

In [74]:
fulldata = pd.DataFrame()
for dataset in data.keys():
    fulldata = pd.DataFrame(data[dataset])
    prompts = fulldata["PromptType"].unique()
    
    # fig, axes = plt.pyplot.subplots(1, len(prompts), figsize=(30, 6))
    for i in range(len(prompts)):
        prompt = prompts[i]
        cmd = sk.metrics.ConfusionMatrixDisplay(confusion_matrix(fulldata, prompt), display_labels=["Used Brute Force", "Did Not Use Brute Force"])
        cmd.plot()
        cmd.ax_.set(xlabel='Human Solution', ylabel='Model Solution')
        cmd.ax_.set_title(f'{dataset} Brute Force Matrix for {prompt}')
    
        print(f"Correctness for {dataset} on {prompt}:", correctness(fulldata, prompt))

KeyError: 'human_bruteforce'

In [None]:
fig, axes = plt.pyplot.subplots(1, 5, figsize=(30, 6))
for i in range(len(prompts)):
    prompt = prompts[i]
    cmd = sk.metrics.ConfusionMatrixDisplay(confusion_matrix(fulldata, prompt), display_labels=["Used Brute Force", "Did Not Use Brute Force"])
    cmd.plot()
    cmd.ax_.set(xlabel='Human Solution', ylabel='Model Solution')
    cmd.ax_.set_title(f'Brute Force Matrix for {prompt}')

In [None]:
tempdata = fulldata[fulldata["PromptType"] == "basicprompt"]
# print(tempdata.head())
len(tempdata[tempdata["model_bruteforce"]=="1"][tempdata["human_bruteforce"]=="0"])