In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from collections import defaultdict
import copy
import pickle
import glob
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

In [None]:
def load_jsonl_pandas(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate over the file instead of using str.splitlines(): the latter also
        # splits on U+2028/U+2029/U+0085, which may appear unescaped inside JSON
        # strings and would cut a record in half.
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank or whitespace-only lines.
                continue
            records.append(json.loads(line))
    return records

In [None]:
# Load every scored evaluation file into a column-oriented dict (one list per column,
# one entry per scored record). Result files whose name contains "checkpoint" are excluded.
available_models = set()
scored_eval_data = defaultdict(list)

success_responses = []

eval_filenames = (
    set(glob.glob("../results/strong_reject_responses/eval_all_responses*.json"))
    - set(glob.glob("../results/strong_reject_responses/*responses*checkpoint*"))
)

# Records whose "score" list contains no non-None value cannot be summarized;
# they are skipped and counted instead of aborting the cell half-way through a row.
n_unscored = 0

# Iterate in sorted order: the iteration order of a set of strings differs between
# kernel sessions, which would make the row order of the resulting frame unstable.
for fname in tqdm(sorted(eval_filenames)):
    # Model name = everything after the first "-" in the path, minus the ".json" suffix.
    model_name = fname.partition("-")[2][:-5]

    if "checkpoint" in model_name:
        continue

    available_models.add(model_name)

    data = load_jsonl_pandas(fname)
    for x in data:
        # presumably x["score"] is a list of per-response scores where None marks a
        # missing score -- TODO confirm against the scoring script
        valid_scores = [s for s in x["score"] if s is not None]
        if not valid_scores:
            n_unscored += 1
            continue

        mean_score = np.mean(valid_scores)
        max_score = max(valid_scores)

        scored_eval_data["model_name"].append(model_name)
        scored_eval_data["forbidden_prompt"].append(x["forbidden_prompt"])
        scored_eval_data["category"].append(x["category"])
        scored_eval_data["jailbreak"].append(x["jailbreak"])
        # (jailbreak, prompt) identifies the same attack across models; used as a join key later.
        scored_eval_data["jailbreak_key"].append((x["jailbreak"], x["forbidden_prompt"]))
        scored_eval_data["scores"].append(x["score"])
        scored_eval_data["mean_score"].append(mean_score)
        scored_eval_data["mean_success"].append(mean_score >= 0.5)
        scored_eval_data["max_score"].append(max_score)
        scored_eval_data["max_success"].append(max_score >= 0.5)
        # "Combined" success: at least one perfect score and a mean of at least 0.5.
        scored_eval_data["combined_success"].append((max_score == 1.0) and (mean_score >= 0.5))

if n_unscored:
    print(f"Skipped {n_unscored} record(s) without any non-None score")


In [None]:
# One row per scored record; the columns are the keys collected in scored_eval_data.
scored_eval_df = pd.DataFrame(scored_eval_data)

In [None]:
# Sanity check: list the distinct model names present in the loaded results.
scored_eval_df.model_name.unique()

In [None]:
# Heatmap of the average per-record mean score for every (jailbreak, model) pair,
# saved as a PDF next to the notebook.

# rcParams only affect artists created after the update, so the font size has to be
# set before the figure is built for it to apply to this plot.
plt.rcParams.update({'font.size': 21})

_df = scored_eval_df[["model_name", "jailbreak", "mean_score"]].groupby(["model_name", "jailbreak"]).agg("mean").reset_index()
plt.figure(figsize=(12, 18))
# Colour scale is pinned to [0, 1] so that the figures are comparable with each other.
sns.heatmap(_df.pivot(index="jailbreak", columns="model_name", values="mean_score"), vmin=0.0, vmax=1.0)
plt.xlabel('')
plt.ylabel('')
plt.savefig("expected_mean_score_by_type.pdf", format="pdf", bbox_inches="tight")


In [None]:
# Heatmap of the average per-record max score for every (jailbreak, model) pair,
# saved as a PDF next to the notebook.

# rcParams only affect artists created after the update, so the font size has to be
# set before the figure is built for it to apply to this plot.
plt.rcParams.update({'font.size': 21})

_df = scored_eval_df[["model_name", "jailbreak", "max_score"]].groupby(["model_name", "jailbreak"]).agg("mean").reset_index()
plt.figure(figsize=(12, 18))

# Colour scale is pinned to [0, 1] so that the figures are comparable with each other.
sns.heatmap(_df.pivot(index="jailbreak", columns="model_name", values="max_score"), vmin=0.0, vmax=1.0)
plt.xlabel('')
plt.ylabel('')

plt.savefig("expected_max_score_by_type.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Rank jailbreaks by their average mean score over all models and prompts and
# show the names of the strongest ones.
top_k_jailbreaks = 20

# (jailbreak name, average mean_score) pairs, in order of first appearance.
success_by_jailbreak = [
    (jailbreak, scored_eval_df.loc[scored_eval_df["jailbreak"] == jailbreak, "mean_score"].mean())
    for jailbreak in scored_eval_df["jailbreak"].unique()
]

# Highest average first. The comprehension yields an empty list for an empty frame
# instead of failing on an index lookup.
[name for name, _ in sorted(success_by_jailbreak, key=lambda pair: -pair[1])[:top_k_jailbreaks]]

In [None]:
# Load the pre-computed pairwise kNN frames from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced
# by a trusted pipeline.
with open("pairwise_knn_layer_dfs.pkl", "rb") as pickle_file:
    knn_layer_frac_dfs = pickle.load(pickle_file)

In [None]:
# Preview the loaded data instead of dumping every frame into the cell output.
# assumes knn_layer_frac_dfs is a non-empty sequence of DataFrames -- TODO confirm
print(f"{len(knn_layer_frac_dfs)} frames loaded")
knn_layer_frac_dfs[0].head()

In [None]:
# Pairwise jailbreak statistics for every (model_a, model_b) pair listed in the first
# kNN frame: score correlation, symmetric ROC-AUC "predictability" and symmetric
# transfer, each for several success thresholds. The resulting columns are attached
# to a deep copy of every kNN frame.
success_keys = [
    "mean_score",
    "max_score",
]
success_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
score_columns = ["jailbreak_key", "mean_score", "max_score"]


def _roc_auc_or_nan(is_success, predictor_scores):
    """Return the ROC AUC of `predictor_scores` for the boolean labels `is_success`.

    ROC AUC is undefined unless both classes are present, which can happen here for
    high thresholds; NaN is returned in that case instead of calling roc_auc_score.
    """
    if is_success.all() or not is_success.any():
        return np.nan
    return roc_auc_score(is_success, predictor_scores)


# Work on a deep copy so the frames loaded from the pickle stay untouched and this
# cell can be re-run without accumulating state.
_knn_layer_frac_dfs = copy.deepcopy(knn_layer_frac_dfs)

model_pairs = list(zip(knn_layer_frac_dfs[0]["model_a"], knn_layer_frac_dfs[0]["model_b"]))

jailbreak_scores_corr = defaultdict(list)
for model_a, model_b in tqdm(model_pairs):
    # Heuristic: two models count as the same family when their names share the
    # first four characters.
    jailbreak_scores_corr["same_family"].append(model_a[:4] == model_b[:4])

    # The per-pair join does not depend on the success key, so it is built once per pair.
    model_a_scores_df = scored_eval_df.loc[scored_eval_df["model_name"] == model_a, score_columns].rename(
        columns={"mean_score": "mean_score_model_a", "max_score": "max_score_model_a"})
    model_b_scores_df = scored_eval_df.loc[scored_eval_df["model_name"] == model_b, score_columns].rename(
        columns={"mean_score": "mean_score_model_b", "max_score": "max_score_model_b"})
    # Keep only (jailbreak, prompt) combinations that were scored for both models.
    joint_scores_df = pd.merge(model_a_scores_df, model_b_scores_df, on='jailbreak_key', how='inner')
    # Not used below; kept so the joined frame can be inspected interactively.
    joint_scores_df[["jailbreak", "forbidden_prompt"]] = joint_scores_df["jailbreak_key"].to_list()

    for success_key in success_keys:
        scores_a = joint_scores_df[success_key + "_model_a"]
        scores_b = joint_scores_df[success_key + "_model_b"]

        jailbreak_scores_corr[success_key].append(scores_a.corr(scores_b))

        for success_threshold in success_thresholds:
            success_a = scores_a >= success_threshold
            success_b = scores_b >= success_threshold

            # NOTE(review): the predictor is always the mean score and the transfer
            # target is always the max score, independent of success_key -- confirm
            # this is intended.
            a_pred_b = _roc_auc_or_nan(success_b, joint_scores_df["mean_score" + "_model_a"])
            b_pred_a = _roc_auc_or_nan(success_a, joint_scores_df["mean_score" + "_model_b"])
            symm_pred = (a_pred_b + b_pred_a) / 2
            symm_diff = np.abs(a_pred_b - b_pred_a)

            jailbreak_scores_corr[success_key + "_symm_pred_" + str(success_threshold)].append(symm_pred)
            jailbreak_scores_corr[success_key + "_symm_diff_" + str(success_threshold)].append(symm_diff)

            # Average max score on the other model over the attacks that succeeded on
            # this model (NaN when no attack reaches the threshold).
            a_transfer_b = joint_scores_df.loc[success_a, "max_score" + "_model_b"].mean()
            b_transfer_a = joint_scores_df.loc[success_b, "max_score" + "_model_a"].mean()

            symm_transfer = (a_transfer_b + b_transfer_a) / 2
            jailbreak_scores_corr[success_key + "_symm_transfer_" + str(success_threshold)].append(symm_transfer)

for layer_df in _knn_layer_frac_dfs:
    # The statistics were computed in the pair order of the first frame; make sure
    # every frame lists the pairs in that same order before attaching them.
    if "model_a" in layer_df and "model_b" in layer_df:
        assert list(zip(layer_df["model_a"], layer_df["model_b"])) == model_pairs, \
            "kNN frames list the model pairs in different orders"
    for column_name, vals_list in jailbreak_scores_corr.items():
        layer_df[column_name] = vals_list


In [None]:
# Inspect the augmented copy: the success statistics were attached to
# _knn_layer_frac_dfs, while knn_layer_frac_dfs itself is left unchanged.
_knn_layer_frac_dfs[0].head()

In [None]:
# Correlate the "knn_iou" column with every pairwise success statistic, separately
# for each frame. Results are stored both as a nested dict (layer_frac -> statistic
# -> correlation) and as a long-format frame.
corr_dict = defaultdict(dict)
corr_data = defaultdict(list)
for _df in _knn_layer_frac_dfs:
    # Statistic columns: names containing "mean" or "max", except the "diff" ones.
    stat_columns = [
        k for k in _knn_layer_frac_dfs[0].keys()
        if any(tag in k for tag in ("mean", "max")) and "diff" not in k
    ]
    for success_key in stat_columns:
        # Only the first unique layer_frac value of the frame is used as its label.
        layer_frac = _df["layer_frac"].unique()[0]
        knn_corr = _df["knn_iou"].corr(_df[success_key])

        corr_dict[layer_frac][success_key] = knn_corr
        corr_data["layer_frac"].append(layer_frac)
        corr_data["success_key"].append(success_key)
        corr_data["pairwise_success_corr"].append(knn_corr)
corr_df = pd.DataFrame.from_dict(corr_data)


In [None]:
# Success statistics (rows) against layer fraction (columns), coloured by their
# correlation with knn_iou.
corr_by_layer = corr_df.pivot(index="success_key", columns="layer_frac", values="pairwise_success_corr")
sns.heatmap(corr_by_layer)

In [None]:
# Scatter of kNN overlap against symmetric predictability (max score, threshold 0.5)
# for a single frame of the list.
layer_index = 7
sns.scatterplot(x='knn_iou', y='max_score_symm_pred_0.5', data=_knn_layer_frac_dfs[layer_index])