In [1]:
import json
with open("CQ_Solver_summary_2.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per question: the question text plus how many planner turns
# called the `update` and `question_decompose` tools.
times = []
for message in data:
    # message["question"] is a JSON-encoded string; the question text is taken
    # from its first node.
    question_data = json.loads(message["question"])
    question = question_data["nodes"][0]["question"]

    update_count = 0
    question_decompose = 0
    for talk in message.get("conversations") or []:
        if not (talk.get("agent") == "planner"
                and talk.get("role") == "assistant"
                and talk.get("tool_calls")):
            continue
        # Only the first tool call of a planner turn is inspected.
        tool_name = talk["tool_calls"][0]["function"]["name"]
        if tool_name == "update":
            update_count += 1
        elif tool_name == "question_decompose":
            question_decompose += 1

    # Store the counts once, after the loop, so a question with no
    # conversation turns still gets both keys (as 0) instead of breaking the
    # totals below with a KeyError.
    times.append({
        "question": question,
        "update_count": update_count,
        "question_decompose": question_decompose,
    })

total_update_count = sum(item['update_count'] for item in times)
total_question_decompose = sum(item['question_decompose'] for item in times)

# Avoid ZeroDivisionError when the summary file holds no questions.
average_update_count = total_update_count / len(times) if times else float("nan")
average_question_decompose = total_question_decompose / len(times) if times else float("nan")

print("Update Count - Total:", total_update_count, ", Average:", average_update_count)
print("Question Decompose - Total:", total_question_decompose, ", Average:", average_question_decompose)

# question text -> question_id, restricted to CQ_Solver runs on gpt-4o.
question_to_id = {}
with open("final_experiment_results.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        item = json.loads(line)
        if item["system"] == "CQ_Solver" and item["model"] == "gpt-4o":
            question_to_id[item["question"]] = item["question_id"]

# question_id -> evaluation text and score dict, same system/model filter.
id_to_eval = {}
with open("final_evaluation_results.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        item = json.loads(line)
        if item["system"] == "CQ_Solver" and item["model"] == "gpt-4o":
            id_to_eval[item["question_id"]] = {
                "content": item["content"],
                "score": item["score"]
            }

# Attach the evaluation to each record via its question text. Records that
# cannot be matched keep the three keys, set to None.
for entry in times:
    question_id = question_to_id.get(entry["question"])
    if question_id and question_id in id_to_eval:
        entry["question_id"] = question_id
        entry["content"] = id_to_eval[question_id]["content"]
        entry["score"] = id_to_eval[question_id]["score"]
    else:
        entry["question_id"] = None
        entry["content"] = None
        entry["score"] = None

print(json.dumps(times))

Update Count - Total: 48 , Average: 0.48
Question Decompose - Total: 122 , Average: 1.22
[{"question": "how did depictions of indians in popular culture help to sway popular opinion?", "update_count": 1, "question_decompose": 1, "question_id": "question_111", "content": "The AI assistant's answer, while providing a reflection on the depiction of Indians in popular culture, deviates from the user's question, which specifically asks about how these depictions have swayed popular opinion regarding Native Americans. The AI assistant's interpretation of \"Indians\" as relating to people from India introduces factual inaccuracy into the response given the context of the reference answer.\n\n### Evaluation:\n\n1. **Factuality**: The assistant's response inaccurately addresses the term \"Indians,\" providing insights primarily relevant to Indian culture rather than Native Americans. Despite offering factual information about Indian stereotypes in media, it fails to directly relate to the user'

In [3]:
import json
import pandas as pd
from scipy.stats import spearmanr, mannwhitneyu

df = pd.DataFrame(times)

# Questions that could not be matched to an evaluation record carry
# score=None. Drop them up front: otherwise their metric columns end up NaN
# (or json_normalize may fail, depending on the pandas version), which makes
# spearmanr return NaN and makes `Final Score >= threshold` evaluate to False,
# silently placing unscored questions in the low-score group.
df = df[df['score'].notna()].reset_index(drop=True)

# Flatten the score dict into one column per metric.
score_df = pd.json_normalize(df['score'].tolist())
df = df.drop(columns='score').join(score_df)

metrics = ['Factuality', 'User Satisfaction', 'Clarity', 'Logical Coherence', 'Completeness', 'Final Score']
actions = ['update_count', 'question_decompose']

# ========== Spearman Rank Correlation ==========
print("=== Spearman Rank Correlation ===")
for action in actions:
    print(f"\n--- Correlation with {action} ---")
    for metric in metrics:
        r, p = spearmanr(df[action], df[metric])
        print(f"{metric:20s} | ρ = {r:.3f} (p = {p:.3g})")

# ========== Group test (median split on Final Score) ==========
# Questions scoring at or above the median form the high-score group.
threshold = df['Final Score'].median()
df['high_score'] = df['Final Score'] >= threshold

print("\n=== Mann-Whitney U test: High vs. Low Final Score Group ===")
for action in actions:
    group_high = df.loc[df['high_score'], action]
    group_low = df.loc[~df['high_score'], action]
    stat, pval = mannwhitneyu(group_high, group_low, alternative='two-sided')
    print(f"{action:20s} | U = {stat:.3f}, p = {pval:.3g}")


=== Spearman Rank Correlation ===

--- Correlation with update_count ---
Factuality           | ρ = 0.106 (p = 0.293)
User Satisfaction    | ρ = -0.021 (p = 0.836)
Clarity              | ρ = -0.026 (p = 0.798)
Logical Coherence    | ρ = 0.038 (p = 0.704)
Completeness         | ρ = 0.098 (p = 0.334)
Final Score          | ρ = 0.052 (p = 0.611)

--- Correlation with question_decompose ---
Factuality           | ρ = 0.103 (p = 0.307)
User Satisfaction    | ρ = 0.143 (p = 0.156)
Clarity              | ρ = -0.003 (p = 0.974)
Logical Coherence    | ρ = 0.057 (p = 0.576)
Completeness         | ρ = 0.097 (p = 0.339)
Final Score          | ρ = 0.119 (p = 0.237)

=== Mann-Whitney U test: High vs. Low Final Score Group ===
update_count         | U = 1325.000, p = 0.555
question_decompose   | U = 1350.000, p = 0.339


In [63]:
import pandas as pd

# Rebuild the per-question table from the `times` records.
df = pd.DataFrame(times)

# Expand each score dict into one column per metric and append those columns
# (the original `score` column is kept next to them).
score_df = df['score'].apply(pd.Series)
df = pd.concat([df, score_df], axis=1)

# Total number of planner actions per question.
action_total = df['update_count'] + df['question_decompose']

# Many actions (>= 3) but a low final score (<= 6).
many_actions_low_score = (action_total >= 3) & (df['Final Score'] <= 6)
high_updates_low_score = df[many_actions_low_score]

# Few actions (<= 1) but a high final score (>= 8).
few_actions_high_score = (action_total <= 1) & (df['Final Score'] >= 8)
low_updates_high_score = df[few_actions_high_score]

for subset in (high_updates_low_score, low_updates_high_score):
    print(subset["question_id"])

12      question_13
67      question_73
71      question_77
101    question_109
134    question_143
136    question_145
181    question_194
194    question_210
239    question_260
254    question_275
257    question_278
294    question_318
345    question_373
375    question_407
387    question_422
397    question_433
407    question_445
410    question_448
411    question_449
429    question_468
436    question_476
452    question_492
Name: question_id, dtype: object
47      question_50
133    question_142
142    question_151
220    question_239
237    question_258
307    question_332
361    question_392
Name: question_id, dtype: object


In [4]:
import json
with open("final_llama_CQ_Solver_summary.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per question: the question text plus how many planner turns
# called the `update` and `question_decompose` tools.
llama_data_beforeCD = []
for message in data:
    # message["question"] is a JSON-encoded string; the question text is taken
    # from its first node.
    question_data = json.loads(message["question"])
    question = question_data["nodes"][0]["question"]

    update_count = 0
    question_decompose = 0
    for talk in message.get("conversations") or []:
        if talk.get("agent") != "planner" or talk.get("role") != "assistant":
            continue
        # In this log format a tool call is recognised by the prefix of the
        # message text; a turn without text cannot be a tool call.
        content = talk.get("content")
        if not isinstance(content, str):
            continue
        if content.startswith("Tool's name:update"):
            update_count += 1
        elif content.startswith("Tool's name:question_decompose"):
            question_decompose += 1

    # Store the counts once, after the loop, so a question with no
    # conversation turns still gets both keys (as 0) instead of breaking the
    # totals below with a KeyError.
    llama_data_beforeCD.append({
        "question": question,
        "update_count": update_count,
        "question_decompose": question_decompose,
    })

total_update_count = sum(item['update_count'] for item in llama_data_beforeCD)
total_question_decompose = sum(item['question_decompose'] for item in llama_data_beforeCD)

# Avoid ZeroDivisionError when the summary file holds no questions.
average_update_count = total_update_count / len(llama_data_beforeCD) if llama_data_beforeCD else float("nan")
average_question_decompose = total_question_decompose / len(llama_data_beforeCD) if llama_data_beforeCD else float("nan")

print("Update Count - Total:", total_update_count, ", Average:", average_update_count)
print("Question Decompose - Total:", total_question_decompose, ", Average:", average_question_decompose)

# question text -> question_id, restricted to CQ_Solver runs on llama 3.3 70B.
question_to_id = {}
with open("final_experiment_results.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        item = json.loads(line)
        if item["system"] == "CQ_Solver" and item["model"] == "llama 3.3 70B":
            question_to_id[item["question"]] = item["question_id"]

# question_id -> evaluation text and score dict, same system/model filter.
id_to_eval = {}
with open("final_evaluation_results.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        item = json.loads(line)
        if item["system"] == "CQ_Solver" and item["model"] == "llama 3.3 70B":
            id_to_eval[item["question_id"]] = {
                "content": item["content"],
                "score": item["score"]
            }

# Attach the evaluation to each record via its question text. Records that
# cannot be matched keep the three keys, set to None.
for entry in llama_data_beforeCD:
    question_id = question_to_id.get(entry["question"])
    if question_id and question_id in id_to_eval:
        entry["question_id"] = question_id
        entry["content"] = id_to_eval[question_id]["content"]
        entry["score"] = id_to_eval[question_id]["score"]
    else:
        entry["question_id"] = None
        entry["content"] = None
        entry["score"] = None

llama_data_beforeCD_df = pd.DataFrame(llama_data_beforeCD)

# Unmatched questions carry score=None. Drop them before the statistics:
# otherwise their metric columns end up NaN (or json_normalize may fail,
# depending on the pandas version), spearmanr returns NaN, and
# `Final Score >= threshold` is False for them, which silently places them in
# the low-score group.
llama_data_beforeCD_df = llama_data_beforeCD_df[
    llama_data_beforeCD_df['score'].notna()
].reset_index(drop=True)

# Flatten the score dict into one column per metric.
score_df = pd.json_normalize(llama_data_beforeCD_df['score'].tolist())
llama_data_beforeCD_df = llama_data_beforeCD_df.drop(columns='score').join(score_df)

metrics = ['Factuality', 'User Satisfaction', 'Clarity', 'Logical Coherence', 'Completeness', 'Final Score']
actions = ['update_count', 'question_decompose']

# ========== Spearman Rank Correlation ==========
print("=== Spearman Rank Correlation ===")
for action in actions:
    print(f"\n--- Correlation with {action} ---")
    for metric in metrics:
        r, p = spearmanr(llama_data_beforeCD_df[action], llama_data_beforeCD_df[metric])
        print(f"{metric:20s} | ρ = {r:.3f} (p = {p:.3g})")

# ========== Group test (median split on Final Score) ==========
# Questions scoring at or above the median form the high-score group.
threshold = llama_data_beforeCD_df['Final Score'].median()
llama_data_beforeCD_df['high_score'] = llama_data_beforeCD_df['Final Score'] >= threshold

print("\n=== Mann-Whitney U test: High vs. Low Final Score Group ===")
for action in actions:
    group_high = llama_data_beforeCD_df.loc[llama_data_beforeCD_df['high_score'], action]
    group_low = llama_data_beforeCD_df.loc[~llama_data_beforeCD_df['high_score'], action]
    stat, pval = mannwhitneyu(group_high, group_low, alternative='two-sided')
    print(f"{action:20s} | U = {stat:.3f}, p = {pval:.3g}")

Update Count - Total: 429 , Average: 0.858
Question Decompose - Total: 603 , Average: 1.206
=== Spearman Rank Correlation ===

--- Correlation with update_count ---
Factuality           | ρ = -0.009 (p = 0.85)
User Satisfaction    | ρ = -0.025 (p = 0.57)
Clarity              | ρ = -0.018 (p = 0.687)
Logical Coherence    | ρ = -0.072 (p = 0.11)
Completeness         | ρ = -0.016 (p = 0.715)
Final Score          | ρ = -0.000 (p = 0.993)

--- Correlation with question_decompose ---
Factuality           | ρ = -0.039 (p = 0.382)
User Satisfaction    | ρ = -0.043 (p = 0.341)
Clarity              | ρ = -0.094 (p = 0.0358)
Logical Coherence    | ρ = -0.072 (p = 0.11)
Completeness         | ρ = -0.025 (p = 0.581)
Final Score          | ρ = -0.040 (p = 0.368)

=== Mann-Whitney U test: High vs. Low Final Score Group ===
update_count         | U = 25507.000, p = 0.491
question_decompose   | U = 24421.000, p = 0.66


In [9]:
import json
with open("../final_RQ3_summary.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per question: the question text plus how many planner turns
# called the `update` and `question_decompose` tools.
llama_data_AfterCD = []
for message in data:
    # message["question"] is a JSON-encoded string; the question text is taken
    # from its first node.
    question_data = json.loads(message["question"])
    question = question_data["nodes"][0]["question"]

    update_count = 0
    question_decompose = 0
    for talk in message.get("conversations") or []:
        if talk.get("agent") != "planner" or talk.get("role") != "assistant":
            continue
        # In this log format a tool call is recognised by the prefix of the
        # message text; a turn without text cannot be a tool call.
        content = talk.get("content")
        if not isinstance(content, str):
            continue
        if content.startswith("Tool's name:update"):
            update_count += 1
        elif content.startswith("Tool's name:question_decompose"):
            question_decompose += 1

    # Store the counts once, after the loop, so a question with no
    # conversation turns still gets both keys (as 0) instead of breaking the
    # totals below with a KeyError.
    llama_data_AfterCD.append({
        "question": question,
        "update_count": update_count,
        "question_decompose": question_decompose,
    })

total_update_count = sum(item['update_count'] for item in llama_data_AfterCD)
total_question_decompose = sum(item['question_decompose'] for item in llama_data_AfterCD)

# Avoid ZeroDivisionError when the summary file holds no questions.
average_update_count = total_update_count / len(llama_data_AfterCD) if llama_data_AfterCD else float("nan")
average_question_decompose = total_question_decompose / len(llama_data_AfterCD) if llama_data_AfterCD else float("nan")

print("Update Count - Total:", total_update_count, ", Average:", average_update_count)
print("Question Decompose - Total:", total_question_decompose, ", Average:", average_question_decompose)

# question text -> question_id, restricted to CQ_Solver runs on llama 3.3 70B.
question_to_id = {}
with open("../final_RQ3_experiment_results.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        item = json.loads(line)
        if item["system"] == "CQ_Solver" and item["model"] == "llama 3.3 70B":
            question_to_id[item["question"]] = item["question_id"]

# question_id -> evaluation text and score dict, same system/model filter.
id_to_eval = {}
with open("../final_refined_evaluation_results.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        item = json.loads(line)
        if item["system"] == "CQ_Solver" and item["model"] == "llama 3.3 70B":
            id_to_eval[item["question_id"]] = {
                "content": item["content"],
                "score": item["score"]
            }

# Attach the evaluation to each record via its question text. Records that
# cannot be matched keep the three keys, set to None.
for entry in llama_data_AfterCD:
    question_id = question_to_id.get(entry["question"])
    if question_id and question_id in id_to_eval:
        entry["question_id"] = question_id
        entry["content"] = id_to_eval[question_id]["content"]
        entry["score"] = id_to_eval[question_id]["score"]
    else:
        entry["question_id"] = None
        entry["content"] = None
        entry["score"] = None

llama_data_AfterCD_df = pd.DataFrame(llama_data_AfterCD)

# Unmatched questions carry score=None. Drop them before the statistics:
# otherwise their metric columns end up NaN (or json_normalize may fail,
# depending on the pandas version) and spearmanr returns NaN.
llama_data_AfterCD_df = llama_data_AfterCD_df[
    llama_data_AfterCD_df['score'].notna()
].reset_index(drop=True)

# Flatten the score dict into one column per metric.
score_df = pd.json_normalize(llama_data_AfterCD_df['score'].tolist())
llama_data_AfterCD_df = llama_data_AfterCD_df.drop(columns='score').join(score_df)

metrics = ['Factuality', 'User Satisfaction', 'Clarity', 'Logical Coherence', 'Completeness', 'Final Score']

# ========== Spearman Rank Correlation ==========
print("=== Spearman Rank Correlation ===")
for action in ['update_count', 'question_decompose']:
    print(f"\n--- Correlation with {action} ---")
    for metric in metrics:
        r, p = spearmanr(llama_data_AfterCD_df[action], llama_data_AfterCD_df[metric])
        print(f"{metric:20s} | ρ = {r:.3f} (p = {p:.3g})")

# ========== Group test (median split on Final Score) ==========
# Build the high-/low-score groups
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, rankdata

def mannwhitney_detailed(group1, group2, group1_label="High", group2_label="Low", variable_name=""):
    """Run a two-sided Mann-Whitney U test and report per-group rank statistics.

    Both samples are pooled and ranked together with `rankdata`; the ranks are
    then split back into the two groups to obtain each group's mean rank and
    rank sum.

    Args:
        group1: First sample (array-like of numbers).
        group2: Second sample (array-like of numbers).
        group1_label: Display label stored for the first group.
        group2_label: Display label stored for the second group.
        variable_name: Name of the variable being compared.

    Returns:
        dict with, in order: the variable name, then label / size / mean rank /
        rank sum for group A and for group B, then the U statistic and the
        p-value returned by `mannwhitneyu`. The keys are the Chinese column
        labels used for the printed table.
    """
    n_first = len(group1)
    pooled_ranks = rankdata(np.concatenate([group1, group2]))
    ranks_first = pooled_ranks[:n_first]
    ranks_second = pooled_ranks[n_first:]

    u_stat, p_val = mannwhitneyu(group1, group2, alternative='two-sided')

    summary = {"變數": variable_name}
    per_group = (
        ("A", group1_label, group1, ranks_first),
        ("B", group2_label, group2, ranks_second),
    )
    for suffix, label, sample, sample_ranks in per_group:
        summary[f"組別{suffix}"] = label
        summary[f"人數{suffix}"] = len(sample)
        summary[f"等級平均{suffix}"] = np.mean(sample_ranks)
        summary[f"等級總和{suffix}"] = np.sum(sample_ranks)
    summary["U 值"] = u_stat
    summary["p 值"] = p_val
    return summary
    
    
# Median split: questions scoring at or above the median Final Score are
# flagged as high scorers.
final_score = llama_data_AfterCD_df['Final Score']
threshold = final_score.median()
llama_data_AfterCD_df['high_score'] = final_score >= threshold

# A missing Final Score compares False against the threshold, so such rows
# would otherwise be counted as low scorers. Exclude them from both groups.
has_score = final_score.notna()
is_high = llama_data_AfterCD_df['high_score']

# ========== Run the Mann-Whitney analysis ==========
results = []
for action in ['update_count', 'question_decompose']:
    group_high = llama_data_AfterCD_df.loc[has_score & is_high, action].values
    group_low = llama_data_AfterCD_df.loc[has_score & ~is_high, action].values
    results.append(mannwhitney_detailed(group_high, group_low, variable_name=action))

# Print the result table.
mann_table = pd.DataFrame(results)
print("\n=== Mann-Whitney U 詳細表格 ===")
print(mann_table.to_string(index=False))

Update Count - Total: 428 , Average: 0.856
Question Decompose - Total: 594 , Average: 1.188
=== Spearman Rank Correlation ===

--- Correlation with update_count ---
Factuality           | ρ = -0.132 (p = 0.00307)
User Satisfaction    | ρ = -0.122 (p = 0.0064)
Clarity              | ρ = -0.016 (p = 0.726)
Logical Coherence    | ρ = -0.031 (p = 0.492)
Completeness         | ρ = -0.137 (p = 0.00216)
Final Score          | ρ = -0.169 (p = 0.00015)

--- Correlation with question_decompose ---
Factuality           | ρ = -0.008 (p = 0.85)
User Satisfaction    | ρ = -0.101 (p = 0.0238)
Clarity              | ρ = -0.069 (p = 0.125)
Logical Coherence    | ρ = -0.075 (p = 0.0932)
Completeness         | ρ = -0.074 (p = 0.0982)
Final Score          | ρ = -0.094 (p = 0.0362)

=== Mann-Whitney U 詳細表格 ===
                變數  組別A  人數A      等級平均A   等級總和A 組別B  人數B      等級平均B   等級總和B     U 值      p 值
      update_count High  380 241.931579 91934.0 Low  120 277.633333 33316.0 19544.0 0.000205
question_deco

In [64]:
import pandas as pd

# Rebuild the per-question table from the `llama_data_AfterCD` records.
df = pd.DataFrame(llama_data_AfterCD)

# Expand each score dict into one column per metric and append those columns
# (the original `score` column is kept next to them).
score_df = df['score'].apply(pd.Series)
df = pd.concat([df, score_df], axis=1)

# Total number of planner actions per question.
action_total = df['update_count'] + df['question_decompose']

# Many actions (>= 3) but a low final score (<= 6).
many_actions_low_score = (action_total >= 3) & (df['Final Score'] <= 6)
high_updates_low_score = df[many_actions_low_score]

# Few actions (<= 1) but a high final score (>= 8).
few_actions_high_score = (action_total <= 1) & (df['Final Score'] >= 8)
low_updates_high_score = df[few_actions_high_score]

for subset in (high_updates_low_score, low_updates_high_score):
    print(subset["question_id"])

19      question_20
21      question_23
33      question_36
88     question_100
90     question_103
105    question_118
106    question_119
114    question_129
149    question_168
181    question_205
194    question_221
200    question_227
222    question_251
223    question_252
244    question_275
278    question_314
284    question_321
285    question_322
313    question_353
314    question_354
372    question_420
386    question_435
390    question_439
407    question_460
435    question_494
454    question_171
457    question_194
464    question_276
465    question_282
480    question_450
499    question_228
Name: question_id, dtype: object
53      question_60
77      question_86
164    question_185
221    question_250
274    question_309
368    question_416
373    question_421
391    question_440
396    question_446
Name: question_id, dtype: object


In [66]:
# Reload the RQ3 summary and dump the conversation log of the record at
# position 250 for manual inspection.
with open("../final_RQ3_summary.json", mode='r', encoding='utf-8') as f:
    data = json.load(f)

inspected_record = data[250]
print(inspected_record["conversations"])


[{'agent': 'planner', 'role': 'system', 'content': 'You are a Planner Agent designed to reason through complex, open-ended, or ambiguous questions by constructing, reflecting on, and expanding a directed acyclic graph (DAG) of interrelated sub-questions. Your task is not simply to retrieve answers, but to actively explore the question space, refine your understanding, and make informed decisions about when the original question has been sufficiently addressed.\n\n---\n\n## Problem Space Representation: The Question DAG\n\nThe DAG is your evolving internal model of the problem. It represents your reasoning process — how the main question relates to sub-questions, intermediate knowledge, and reflections.\nEach node contains:\n- `node_id`: a unique identifier\n- `question`: a sub-question or original question\n- `annotation`: your current thoughts, insights, summaries, or hypotheses about that question\n\nEach annotation helps build and maintain your internal representation of the problem

# 顯著性統計檢定

🔍 Wilcoxon Signed-Rank Test — RQ: RQ1, Scorer: gpt-4o

➡ Comparing [ReAct] vs [MindSearch] under base model: gpt-4o
  ▶ Factuality          : statistic = 16805.5  p = 0.00008
  ▶ User Satisfaction   : statistic = 3095.0  p = 0.00000
  ▶ Clarity             : statistic = 10473.0  p = 0.00000
  ▶ Logical Coherence   : statistic = 11453.5  p = 0.00000
  ▶ Completeness        : statistic = 3161.0  p = 0.00000
  ▶ Final Score         : statistic = 3297.0  p = 0.00000

➡ Comparing [ReAct] vs [MindSearch] under base model: llama 3.3 70B
  ▶ Factuality          : statistic = 9824.0  p = 0.00000
  ▶ User Satisfaction   : statistic = 4548.5  p = 0.00000
  ▶ Clarity             : statistic = 16324.0  p = 0.00000
  ▶ Logical Coherence   : statistic = 16362.5  p = 0.00000
  ▶ Completeness        : statistic = 2781.0  p = 0.00000
  ▶ Final Score         : statistic = 4954.0  p = 0.00000


🔍 Wilcoxon Signed-Rank Test — RQ: RQ1, Scorer: llama 3.3 70B

➡ Comparing [ReAct] vs [MindSearch] under base model: gpt-4o
  ▶ Factuality          : statistic = 5248.0  p = 0.00000
  ▶ User Satisfaction   : statistic = 18437.5  p = 0.00000
  ▶ Clarity             : statistic = 5968.5  p = 0.00000
  ▶ Logical Coherence   : statistic = 8175.0  p = 0.00000
  ▶ Completeness        : statistic = 6337.5  p = 0.00000
  ▶ Final Score         : statistic = 9265.0  p = 0.00000

➡ Comparing [ReAct] vs [MindSearch] under base model: llama 3.3 70B
  ▶ Factuality          : statistic = 3121.0  p = 0.00000
  ▶ User Satisfaction   : statistic = 2167.0  p = 0.00000
  ▶ Clarity             : statistic = 8972.0  p = 0.00000
  ▶ Logical Coherence   : statistic = 5765.0  p = 0.00000
  ▶ Completeness        : statistic = 2222.5  p = 0.00000
  ▶ Final Score         : statistic = 2857.5  p = 0.00000

In [None]:
import json
import pandas as pd
from scipy.stats import wilcoxon

# === Tunable parameters ===
RQ = "RQ1"  # or "RQ2"
scorer = "llama 3.3 70B"  # judge model; selects which JSONL of evaluations is read
jsonl_file = "final_evaluation_results.jsonl" if scorer == "gpt-4o" else "llama_rq1rq2_evaluation_results.jsonl"

# === Pick the pair of systems to compare for the chosen RQ ===
if RQ == "RQ1":
    system_a = "ReAct"
    system_b = "MindSearch"  # Q-Decomp ReAct results are stored under "MindSearch"
elif RQ == "RQ2":
    system_a = "CQ_Solver"
    system_b = "MindSearch"
else:
    # Fail here rather than with a NameError on system_a further down.
    raise ValueError(f"Invalid RQ: {RQ!r}")

# === Base models, i.e. the model that generated the answers ===
base_models = ["gpt-4o", "llama 3.3 70B"]

# === Evaluation metrics ===
metrics = ["Factuality", "User Satisfaction", "Clarity",
           "Logical Coherence", "Completeness", "Final Score"]

# Score assigned on every metric to the reference answers (MindSearch with
# gpt-4o as base model), for which no evaluation records exist.
REFERENCE_SCORE = 8

# === Load data ===
with open(jsonl_file, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing on them.
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# === Per-base-model analysis ===
print(f"🔍 Wilcoxon Signed-Rank Test — RQ: {RQ}, Scorer: {scorer}")
for model in base_models:
    # Restrict to this base model, then to system a / system b.
    df_model = df[df['model'] == model]
    df_a = df_model[df_model['system'] == system_a].set_index('question_id')

    # Special case: with gpt-4o as base model, MindSearch is the reference
    # answer, so system a is compared against the constant REFERENCE_SCORE.
    reference_only = system_b == "MindSearch" and model == "gpt-4o"
    if reference_only:
        common_ids = sorted(df_a.index.unique())
    else:
        df_b = df_model[df_model['system'] == system_b].set_index('question_id')
        common_ids = sorted(df_a.index.intersection(df_b.index))

    print(
        f"\n➡ Comparing [{system_a}] vs [{system_b}] under base model: {model}")

    if not common_ids:
        # Nothing to pair; the signed-rank test needs at least one pair.
        print("  (no paired question_ids found, skipped)")
        continue

    for metric in metrics:
        scores_a = df_a.loc[common_ids, 'score'].apply(
            lambda x: x[metric]).astype(int)
        if reference_only:
            scores_b = pd.Series([REFERENCE_SCORE] * len(common_ids), index=common_ids)
        else:
            scores_b = df_b.loc[common_ids, 'score'].apply(
                lambda x: x[metric]).astype(int)

        stat, p = wilcoxon(scores_a, scores_b)
        print(f"  ▶ {metric:<20}: statistic = {stat:<6}  p = {p:.5f}")

🔍 Wilcoxon Signed-Rank Test — RQ: RQ2, Scorer: llama 3.3 70B

➡ Comparing [CQ_Solver] vs [MindSearch] under base model: gpt-4o
  ▶ Factuality          : statistic = 442.0   p = 0.00000
  ▶ User Satisfaction   : statistic = 28072.0  p = 0.96512
  ▶ Clarity             : statistic = 1748.0  p = 0.00000
  ▶ Logical Coherence   : statistic = 836.0   p = 0.00000
  ▶ Completeness        : statistic = 15306.0  p = 0.00000
  ▶ Final Score         : statistic = 8436.0  p = 0.00000

➡ Comparing [CQ_Solver] vs [MindSearch] under base model: llama 3.3 70B
  ▶ Factuality          : statistic = 12637.5  p = 0.41401
  ▶ User Satisfaction   : statistic = 20507.5  p = 0.00221
  ▶ Clarity             : statistic = 17550.5  p = 0.35005
  ▶ Logical Coherence   : statistic = 24097.5  p = 0.75787
  ▶ Completeness        : statistic = 23269.0  p = 0.00003
  ▶ Final Score         : statistic = 21255.5  p = 0.00141


In [18]:
import json
import pandas as pd
from scipy.stats import wilcoxon

# === Tunable parameters ===
scorer = "llama 3.3 70B"  # either "gpt-4o" or "llama 3.3 70B"

# === Evaluation files (before, after) for each supported scorer ===
files_by_scorer = {
    "gpt-4o": ("final_evaluation_results.jsonl",
               "final_refined_evaluation_results.jsonl"),
    "llama 3.3 70B": ("llama_rq1rq2_evaluation_results.jsonl",
                      "llama_ver_RQ3_evaluation_results.jsonl"),
}
if scorer not in files_by_scorer:
    raise ValueError("Invalid scorer")
file_before, file_after = files_by_scorer[scorer]

# === Evaluation metrics ===
metrics = [
    "Factuality",
    "User Satisfaction",
    "Clarity",
    "Logical Coherence",
    "Completeness",
    "Final Score",
]

# === Load JSONL data ===
def load_data(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list of the decoded JSON values, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. a trailing newline) are skipped
        # rather than passed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Read the records of the `file_before` and `file_after` evaluation files.
data_before, data_after = (load_data(path) for path in (file_before, file_after))

# === Select the records of one system/model pair (default: CQ_Solver on LLaMA 3.3 70B) ===
def filter_cqsolver(data, system="CQ_Solver", model="llama 3.3 70B"):
    """Map question_id to score for the records of one system/model pair.

    Args:
        data: Iterable of dicts with the keys 'question_id', 'score',
            'system' and 'model'.
        system: Value of 'system' to keep; defaults to "CQ_Solver".
        model: Value of 'model' to keep; defaults to "llama 3.3 70B".

    Returns:
        dict mapping each kept record's 'question_id' to its 'score'. When a
        question_id occurs more than once, the last matching record wins.
    """
    return {
        d['question_id']: d['score']
        for d in data
        if d['system'] == system and d['model'] == model
    }

scores_before = filter_cqsolver(data_before)
scores_after = filter_cqsolver(data_after)

# === Keep only question ids present in both runs, in sorted order ===
shared_ids = scores_before.keys() & scores_after.keys()
question_ids = sorted(shared_ids)

# === Run the Wilcoxon signed-rank test for each metric ===
print(f"🔍 RQ3 Wilcoxon Signed-Rank Test — Scorer: {scorer}")
for metric in metrics:
    # Pair the two runs question by question so both series share one order.
    paired = [(scores_before[q][metric], scores_after[q][metric]) for q in question_ids]
    vals_before = pd.Series([before for before, _ in paired])
    vals_after = pd.Series([after for _, after in paired])
    stat, p = wilcoxon(vals_before, vals_after)
    print(f"  ▶ {metric:<20}: statistic = {stat:<7} p = {p:.5f}")


🔍 RQ3 Wilcoxon Signed-Rank Test — Scorer: llama 3.3 70B
  ▶ Factuality          : statistic = 11363.5 p = 0.42172
  ▶ User Satisfaction   : statistic = 21045.0 p = 0.01197
  ▶ Clarity             : statistic = 18662.5 p = 0.13150
  ▶ Logical Coherence   : statistic = 22293.5 p = 0.11214
  ▶ Completeness        : statistic = 25999.5 p = 0.01899
  ▶ Final Score         : statistic = 19407.5 p = 0.00697
