In [1]:
import json
import warnings

import matplotlib as plt
import matplotlib.pyplot  # loads the submodule so `plt.pyplot` resolves on a fresh kernel
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics  # loads the submodule so `sk.metrics` resolves on a fresh kernel
# NOTE(review): this silences every warning category for the rest of the
# kernel session, so library deprecation/indexing warnings will not be shown.
# Consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')


In [2]:
def confusion_matrix(data, prompt):
    """Build the 2x2 model-vs-human brute-force count matrix for one prompt type.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "PromptType", "model_bruteforce" and
        "human_bruteforce". The two brute-force columns are compared against
        the strings "1" and "0".
    prompt
        Value of "PromptType" to restrict the rows to.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (2, 2). Rows index the model flag and columns
        the human flag, both in the order ("1", "0"):
        ``[[model=1 & human=1, model=1 & human=0],
           [model=0 & human=1, model=0 & human=0]]``.
        Rows whose flags are neither "1" nor "0" are not counted anywhere.
    """
    subset = data[data["PromptType"] == prompt]
    model_flag = subset["model_bruteforce"]
    human_flag = subset["human_bruteforce"]
    labels = ("1", "0")
    # Both masks come from the same filtered frame, so they can be combined
    # with `&` directly instead of chaining two boolean selections (which
    # depends on pandas realigning the second mask to the first selection).
    return np.array([
        [int(((model_flag == m) & (human_flag == h)).sum()) for h in labels]
        for m in labels
    ])

def correctness(data, prompt):
    """Return the fraction of rows for one prompt type whose "correctness" is "1".

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "PromptType" and "correctness"; the latter is
        compared against the string "1".
    prompt
        Value of "PromptType" to restrict the rows to.

    Returns
    -------
    float
        Number of matching rows with correctness "1" divided by the number of
        matching rows, or NaN when no row has the requested prompt type
        (instead of raising ZeroDivisionError).
    """
    subset = data[data["PromptType"] == prompt]
    total = len(subset)
    if total == 0:
        # Nothing to average over; NaN keeps reporting loops running.
        return float("nan")
    return int((subset["correctness"] == "1").sum()) / total

In [18]:
def _parse_flag(value):
    """Return 0 or 1 for a grader flag such as "0"/"1", or None for anything else."""
    try:
        flag = int(value)
    except (TypeError, ValueError):
        # Missing (None/NaN) or non-numeric grades are skipped, not counted.
        return None
    return flag if flag in (0, 1) else None


# For every model, load the two graded JSONL files (solution-summary grades and
# final-answer correctness grades) and tally correctness per prompt type,
# split by whether the summary was graded "1" or "0".
data = {}      # model name -> DataFrame of correctness grades (FinalMath file)
summary = {}   # model name -> DataFrame of summary grades (SolutionSummary file)
models = ['DSChat', 'DSReason', 'GeminiFlash', 'GeminiPro', 'o3', 'Qwen1', 'Qwen14', 'Qwen70']
# Order of the middle axis of `scores`.
prompt_types = ["basicprompt", "mathPrompt", "hintPrompt"]

for model in models:
    file_path = f"response_evaluation/Math/SolutionSummary-{model}/resultsEvaluations_evaluatedbyo3-2025-04-16.jsonl"
    with open(file_path, 'r') as file:
        summaries = pd.DataFrame([json.loads(line) for line in file])

    with open(f"response_evaluation/Math/FinalMath-{model}/resultsEvaluations_evaluatedbyo3-2025-04-16.jsonl", 'r') as file:
        # Deliberately NOT named `correctness`: that name belongs to the helper
        # function defined earlier, and rebinding it to a DataFrame makes later
        # calls such as correctness(fulldata, prompt) fail.
        correctness_df = pd.DataFrame([json.loads(line) for line in file])

    prompts = summaries["PromptType"].unique()
    data[model] = correctness_df
    summary[model] = summaries
    print(model)

    # scores[s, p, 0]: problems with summary grade s that have a usable grade for prompt p
    # scores[s, p, 1]: how many of those were graded correct
    scores = np.zeros((2, 3, 2))

    print("Adequate:", len(summaries[summaries["Summary"]=="1"]))
    print("Inadequate:", len(summaries[summaries["Summary"]=="0"]))

    required_columns = {"ID", "PromptType", "correctness"}
    if "ID" not in summaries.columns or not required_columns.issubset(correctness_df.columns):
        # Without these columns nothing can be matched; report it and leave the
        # tally at zero rather than failing the whole run.
        print(f"Skipping tally for {model}: expected columns are missing")
        print(scores)
        continue

    for num in range(250):
        summary_rows = summaries.loc[summaries["ID"] == num, "Summary"]
        if summary_rows.empty:
            continue  # no summary grade recorded for this problem ID
        adequate = _parse_flag(summary_rows.values[0])
        if adequate is None:
            continue  # summary grade is missing or not 0/1

        for row, prompt_type in enumerate(prompt_types):
            # One combined mask instead of two chained boolean selections.
            is_match = (correctness_df["PromptType"] == prompt_type) & (correctness_df["ID"] == num)
            graded = correctness_df.loc[is_match, "correctness"]
            if graded.empty:
                continue  # this problem was not graded under this prompt type
            is_correct = _parse_flag(graded.values[0])
            if is_correct is None:
                continue
            scores[adequate, row, 0] += 1
            scores[adequate, row, 1] += is_correct

    print(scores)

DSChat
Adequate: 210
Inadequate: 40
[[[ 40.  16.]
  [ 40.  13.]
  [  0.   0.]]

 [[210. 129.]
  [210. 121.]
  [  0.   0.]]]
DSReason
Adequate: 215
Inadequate: 35
[[[ 35.  15.]
  [ 35.  15.]
  [  0.   0.]]

 [[215. 151.]
  [214. 153.]
  [  0.   0.]]]
GeminiFlash
Adequate: 188
Inadequate: 49
[[[ 47.  28.]
  [ 48.  29.]
  [  0.   0.]]

 [[185. 129.]
  [183. 129.]
  [  0.   0.]]]
GeminiPro
Adequate: 22
Inadequate: 228
[[[221. 166.]
  [221. 170.]
  [  0.   0.]]

 [[ 22.  15.]
  [ 22.  14.]
  [  0.   0.]]]
o3
Adequate: 204
Inadequate: 42
[[[ 40.  31.]
  [ 42.  31.]
  [  0.   0.]]

 [[202. 166.]
  [202. 165.]
  [  0.   0.]]]
Qwen1
Adequate: 65
Inadequate: 185
[[[185.  22.]
  [185.  19.]
  [  0.   0.]]

 [[ 65.  19.]
  [ 65.  21.]
  [  0.   0.]]]
Qwen14
Adequate: 163
Inadequate: 87
[[[ 87.  23.]
  [ 87.  16.]
  [  0.   0.]]

 [[163.  83.]
  [163.  87.]
  [  0.   0.]]]
Qwen70
Adequate: 169
Inadequate: 81
[[[ 81.  20.]
  [ 81.  16.]
  [  0.   0.]]

 [[169.  85.]
  [169.  85.]
  [  0.   0.]]]


In [None]:
# For each loaded model: draw one brute-force confusion matrix per prompt type
# and print the fraction of rows graded correct for that prompt.
fulldata = pd.DataFrame()
for dataset, records in data.items():
    fulldata = pd.DataFrame(records)
    prompts = fulldata["PromptType"].unique()

    for i, prompt in enumerate(prompts):
        matrix = confusion_matrix(fulldata, prompt)
        cmd = sk.metrics.ConfusionMatrixDisplay(
            matrix,
            display_labels=["Used Brute Force", "Did Not Use Brute Force"],
        )
        cmd.plot()
        # Matrix rows are the model's flag and columns the human's flag.
        cmd.ax_.set(xlabel='Human Solution', ylabel='Model Solution')
        cmd.ax_.set_title(f'{dataset} Brute Force Matrix for {prompt}')

        fraction_correct = correctness(fulldata, prompt)
        print(f"Correctness for {dataset} on {prompt}:", fraction_correct)

In [None]:
# Side-by-side brute-force confusion matrices, one panel per prompt type, for
# the `fulldata` / `prompts` currently in scope (as left by the per-model loop
# that assigns them).
n_panels = max(len(prompts), 1)  # subplots() needs at least one column
# squeeze=False keeps `axes` two-dimensional even when there is a single panel.
fig, axes = plt.pyplot.subplots(1, n_panels, figsize=(30, 6), squeeze=False)
for i, prompt in enumerate(prompts):
    cmd = sk.metrics.ConfusionMatrixDisplay(
        confusion_matrix(fulldata, prompt),
        display_labels=["Used Brute Force", "Did Not Use Brute Force"],
    )
    # Draw into the prepared panel; without `ax=` each plot() call opens its
    # own figure and the grid created above stays empty.
    cmd.plot(ax=axes[0, i])
    cmd.ax_.set(xlabel='Human Solution', ylabel='Model Solution')
    cmd.ax_.set_title(f'Brute Force Matrix for {prompt}')

In [None]:
# Spot check: count "basicprompt" rows where model_bruteforce is "1" and
# human_bruteforce is "0" (the top-right cell of confusion_matrix's output).
tempdata = fulldata[fulldata["PromptType"] == "basicprompt"]
# Single combined mask on `tempdata` rather than two chained boolean
# selections; int() keeps the displayed result a plain Python integer.
int(((tempdata["model_bruteforce"] == "1") & (tempdata["human_bruteforce"] == "0")).sum())