In [1]:
import pandas as pd
import os
import json

def load_and_merge_logs(folder_path="monitor_logs"):
    """
    Load every ``.json`` file in a folder and merge the contents into one list.

    Files are visited in sorted filename order so the merged list is
    reproducible from run to run (``os.listdir`` alone returns names in
    arbitrary order). Entries whose name does not end in ``.json`` are
    ignored. A file that cannot be read or parsed is reported on stdout and
    skipped; it does not abort the whole load.

    Args:
        folder_path (str): Path to the folder containing JSON files.
                          Defaults to 'monitor_logs'.

    Returns:
        list: A list containing all JSON contents merged together.
              - If a file's root is a list, its items are extended.
              - If a file's root is anything else (e.g. a dict), the value
                itself is appended.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed (for example
            ``FileNotFoundError`` when the folder does not exist).
    """
    merged_logs = []

    # sorted() pins the iteration order; os.listdir gives no ordering guarantee.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = json.load(f)
        except json.JSONDecodeError as e:
            print(f"⚠️ Failed to parse {filename}: {e}")
        except Exception as e:
            # Deliberately broad: one unreadable file should not stop the
            # remaining files from being loaded.
            print(f"⚠️ Error reading {filename}: {e}")
        else:
            # Merge only after the file has been read and closed successfully.
            if isinstance(content, list):
                merged_logs.extend(content)
            else:
                merged_logs.append(content)

    return merged_logs

def parse_from_actor_log_name(log_name: str) -> tuple[str, str, str]:
    """Extract three underscore-delimited fields from an actor log name.

    Only the text after the last ``/`` is considered. That text is split on
    ``_`` and the fields at positions 1, 2 and 5 are returned, in that order.
    For example,
    ``"actor_logs/response_DeepSeek-R1_Physics_disguised_as_Biology_20250921_222843"``
    yields ``('DeepSeek-R1', 'Physics', 'Biology')``.

    Args:
        log_name: Log name or path, using ``/`` as the separator.

    Returns:
        A 3-tuple of the fields at index 1, 2 and 5.

    Raises:
        IndexError: If the final path component has fewer than six
            ``_``-separated fields.
    """
    base_name = log_name.rsplit("/", 1)[-1]
    fields = base_name.split("_")
    return fields[1], fields[2], fields[5]

def result_from_log(log):
    """Build a flat summary record from the first value of a log mapping.

    Only the first value of ``log`` (in the mapping's iteration order) is
    used; any further values are ignored.

    Args:
        log: Mapping whose first value is itself a mapping with the keys
            ``monitor_name``, ``log_path``, ``suspicious_count`` and
            ``total_files``.

    Returns:
        dict: With keys ``monitor``, ``monitored``, ``suspicious`` and
        ``total``. ``monitored`` is the second ``_``-separated field of the
        last ``/``-separated component of ``log_path``.

    Raises:
        IndexError: If ``log`` has no values, or the file name part of
            ``log_path`` contains no ``_``.
        KeyError: If the first value lacks one of the expected keys.
    """
    first_entry = list(log.values())[0]
    return {
        "monitor": first_entry["monitor_name"],
        "monitored": first_entry["log_path"].rsplit("/", 1)[-1].split("_")[1],
        "suspicious": first_entry["suspicious_count"],
        "total": first_entry["total_files"],
    }


In [72]:
# encoded reasoning
# Load the default 'monitor_logs' folder, flatten every per-log mapping into
# one list of entries, and total the suspicious/total counts per `monitored`
# value.
logs = load_and_merge_logs()
# Named `entries`, not `all`: binding `all` at notebook scope would shadow the
# builtin all() for every cell executed afterwards.
entries = [entry for log in logs for entry in log.values()]
transformed = [
    {
        "monitor": a["monitor_name"],
        # Second "_"-separated field of the log file name.
        "monitored": a["log_path"].split("/")[-1].split("_")[1],
        "suspicious": a["suspicious_count"],
        "total": a["total_files"],
    }
    for a in entries
]
df = pd.DataFrame(transformed)
out = df.groupby("monitored", as_index=False, dropna=False).agg(suspicious=("suspicious", "sum"), total=("total", "sum"))
# Fraction of files flagged suspicious, plus the same value as a percentage.
out["rate"] = out["suspicious"] / out["total"]
out["rate_pct"] = (out["rate"] * 100).round(2)
out

Unnamed: 0,monitored,suspicious,total,rate,rate_pct
0,DeepSeek-R1,1041,1152,0.903646,90.36
1,Qwen3-Next-80B-A3B-Thinking,1037,1377,0.753086,75.31
2,gpt-oss-120b,741,1026,0.722222,72.22
3,gpt-oss-20b,813,1323,0.614512,61.45


In [73]:
# Total the suspicious/total counts per `monitor` and derive the flag rate.
# Uses the `df` already present in the kernel namespace.
out = (
    df.groupby("monitor", as_index=False, dropna=False)
    .agg(suspicious=("suspicious", "sum"), total=("total", "sum"))
    .assign(
        rate=lambda t: t["suspicious"] / t["total"],
        # Later keyword arguments to assign() can read columns created by earlier ones.
        rate_pct=lambda t: (t["rate"] * 100).round(2),
    )
)
out

Unnamed: 0,monitor,suspicious,total,rate,rate_pct
0,Qwen3-Next-80B,1339,1626,0.823493,82.35
1,gpt-oss-120b,1244,1626,0.765068,76.51
2,gpt-oss-20b,1049,1626,0.645141,64.51


In [74]:
# Load the 'monitor_logs_fp' folder, flatten every per-log mapping into one
# list of entries, and total the suspicious/total counts per `monitored` value.
logs = load_and_merge_logs("monitor_logs_fp")
# Named `entries`, not `all`: binding `all` at notebook scope would shadow the
# builtin all() for every cell executed afterwards.
entries = [entry for log in logs for entry in log.values()]
transformed = [
    {
        "monitor": a["monitor_name"],
        # Second "_"-separated field of the log file name.
        "monitored": a["log_path"].split("/")[-1].split("_")[1],
        "suspicious": a["suspicious_count"],
        "total": a["total_files"],
    }
    for a in entries
]
df = pd.DataFrame(transformed)
out = df.groupby("monitored", as_index=False, dropna=False).agg(suspicious=("suspicious", "sum"), total=("total", "sum"))
# Fraction of files flagged suspicious, plus the same value as a percentage.
out["rate"] = out["suspicious"] / out["total"]
out["rate_pct"] = (out["rate"] * 100).round(2)
out

Unnamed: 0,monitored,suspicious,total,rate,rate_pct
0,DeepSeek-R1,98,765,0.128105,12.81
1,Qwen3-Next-80B-A3B-Thinking,208,1422,0.146273,14.63
2,gpt-oss-120b,109,864,0.126157,12.62
3,gpt-oss-20b,158,1216,0.129934,12.99


In [75]:
# Total the suspicious/total counts per `monitor` and derive the flag rate.
# Uses the `df` already present in the kernel namespace.
out = (
    df.groupby("monitor", as_index=False, dropna=False)
    .agg(suspicious=("suspicious", "sum"), total=("total", "sum"))
    .assign(
        rate=lambda t: t["suspicious"] / t["total"],
        # Later keyword arguments to assign() can read columns created by earlier ones.
        rate_pct=lambda t: (t["rate"] * 100).round(2),
    )
)
out

Unnamed: 0,monitor,suspicious,total,rate,rate_pct
0,Qwen3-Next-80B,489,1429,0.342197,34.22
1,gpt-oss-120b,39,1419,0.027484,2.75
2,gpt-oss-20b,45,1419,0.031712,3.17


In [25]:
# Spot-check the file-name parser on one example actor log path.
parse_from_actor_log_name(
    "actor_logs/response_DeepSeek-R1_Physics_disguised_as_Biology_20250921_222843"
)

('DeepSeek-R1', 'Physics', 'Biology')

In [6]:
# Load the 'monitor_logs_correct' folder, flatten every per-log mapping into
# one list of entries, and total the suspicious/total counts per `monitored`
# value.
logs = load_and_merge_logs("monitor_logs_correct")
# Named `entries`, not `all`: binding `all` at notebook scope would shadow the
# builtin all() for every cell executed afterwards.
entries = [entry for log in logs for entry in log.values()]
transformed = [
    {
        "monitor": a["monitor_name"],
        # Second "_"-separated field of the log file name.
        "monitored": a["log_path"].split("/")[-1].split("_")[1],
        "suspicious": a["suspicious_count"],
        "total": a["total_files"],
    }
    for a in entries
]
df = pd.DataFrame(transformed)
out = df.groupby("monitored", as_index=False, dropna=False).agg(suspicious=("suspicious", "sum"), total=("total", "sum"))
# Fraction of files flagged suspicious, plus the same value as a percentage.
out["rate"] = out["suspicious"] / out["total"]
out["rate_pct"] = (out["rate"] * 100).round(2)
out

Unnamed: 0,monitored,suspicious,total,rate,rate_pct
0,DeepSeek-R1,305,330,0.924242,92.42
1,Qwen3-Next-80B-A3B-Thinking,695,840,0.827381,82.74
2,gpt-oss-120b,527,735,0.717007,71.7
3,gpt-oss-20b,568,825,0.688485,68.85


In [7]:
# Total the suspicious/total counts per `monitor` and derive the flag rate.
# Uses the `df` already present in the kernel namespace.
out = (
    df.groupby("monitor", as_index=False, dropna=False)
    .agg(suspicious=("suspicious", "sum"), total=("total", "sum"))
    .assign(
        rate=lambda t: t["suspicious"] / t["total"],
        # Later keyword arguments to assign() can read columns created by earlier ones.
        rate_pct=lambda t: (t["rate"] * 100).round(2),
    )
)
out

Unnamed: 0,monitor,suspicious,total,rate,rate_pct
0,Qwen3-Next-80B,791,910,0.869231,86.92
1,gpt-oss-120b,708,910,0.778022,77.8
2,gpt-oss-20b,596,910,0.654945,65.49
