
# 05 - Safety & Refusal Analysis

Inspect refusal behavior and Perspective safety scores to understand how imitation outputs differ across toxic and non-toxic cases.



**Goals**
- Load the processed imitation + Perspective bundle (and optional variants) for safety analysis.
- Flatten Perspective attributes into analyzable columns alongside refusal flags.
- Summarize refusal rates, compare safety attributes, and surface high-toxicity examples for manual review.


In [None]:

from pathlib import Path
from typing import Dict, Optional, Sequence

import pandas as pd

from utils.data_io import load_df_list_pickle, flatten_conversation_bundles, describe_bundle

In [None]:
# --- Locations -------------------------------------------------------------
# Everything is resolved relative to the directory the notebook is run from.
PROJECT_ROOT = Path.cwd()
ASSETS_PROCESSED = PROJECT_ROOT.joinpath("assets", "processed")

# Input bundles (base + optional variants) and the optional flattened output.
PERSPECTIVE_PATH = ASSETS_PROCESSED.joinpath("combat_threads_with_perspective.pkl")
PERSPECTIVE_LIST_PATH = ASSETS_PROCESSED.joinpath("combat_threads_with_perspective_list.pkl")
PERSPECTIVE_MORE_REFUSE_PATH = ASSETS_PROCESSED.joinpath(
    "combat_threads_with_perspective_list_more_refuse_cleaned.pkl"
)
FLAT_SCORES_PATH = ASSETS_PROCESSED.joinpath("combat_threads_with_perspective_scores.parquet")

# --- Toggles ---------------------------------------------------------------
# Which bundle to analyse; options: perspective_list, more_refuse, base
SOURCE_MODE = "perspective_list"
# A turn is labelled toxic when Perspective TOXICITY >= this value.
HIGH_TOXICITY = 0.5
# Used when bucketing conversations by their share of refused turns.
REFUSAL_CONVERSATION_THRESHOLD = 0.1
# more_refuse bundle marks refusal when true_rate falls below this share.
REFUSAL_TRUE_RATE_THRESHOLD = 0.67

# Bare expression so the notebook cell displays the resolved assets directory.
ASSETS_PROCESSED



### Asset manifest
List available inputs/variants and the derived artifact this notebook can emit.


In [None]:
def resolve_source(mode: str):
    """Pick the bundle path for ``mode``, falling back to the base bundle.

    A variant path is returned only when ``mode`` names that variant AND the
    file exists on disk. Any other mode, or a missing variant file, yields
    ``PERSPECTIVE_PATH`` (whose existence is not checked here).
    """
    variant_paths = (
        ("more_refuse", PERSPECTIVE_MORE_REFUSE_PATH),
        ("perspective_list", PERSPECTIVE_LIST_PATH),
    )
    for variant_mode, variant_path in variant_paths:
        if mode == variant_mode and variant_path.exists():
            return variant_path
    return PERSPECTIVE_PATH


SOURCE_PATH = resolve_source(SOURCE_MODE)

# One (role, path, note) triple per asset; the selected source comes first so
# that it survives the de-duplication on "path" below.
manifest = [
    {"role": role, "path": path, "note": note}
    for role, path, note in (
        (
            "source",
            SOURCE_PATH,
            "Bundle used for analysis (selected via SOURCE_MODE).",
        ),
        (
            "variant_optional",
            PERSPECTIVE_LIST_PATH,
            "Perspective dict + pre-extracted list vectors (default source when present).",
        ),
        (
            "variant_optional",
            PERSPECTIVE_MORE_REFUSE_PATH,
            "Same as above with extra refusal heuristics (refuse_add/true_rate).",
        ),
        (
            "fallback",
            PERSPECTIVE_PATH,
            "Base Perspective dicts without flattened score vectors.",
        ),
        (
            "output_optional",
            FLAT_SCORES_PATH,
            "Optional parquet of flattened Perspective scores for downstream notebooks.",
        ),
    )
]

# The source path always equals one of the variant/fallback paths, so keep
# only the first row per path.
manifest_df = pd.DataFrame(manifest).drop_duplicates(subset=["path"])
manifest_df["exists"] = [Path(asset_path).exists() for asset_path in manifest_df["path"]]
manifest_df


### Load and flatten bundle
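Load the selected bundle, preserve conversation grouping, and add a binary refusal flag (`is_refusal`), derived from `true_rate` for the `more_refuse` bundle and from `imm_1_check` otherwise.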


In [None]:
# Fail fast with a clear message when the selected bundle is missing.
if not SOURCE_PATH.exists():
    raise FileNotFoundError(f"Bundle not found: {SOURCE_PATH}")

# NOTE(review): presumably this unpickles the file — only load bundles from a trusted source.
bundle = load_df_list_pickle(SOURCE_PATH)
print(f"using source: {SOURCE_PATH.name}")
print("bundle summary:", describe_bundle(bundle))

flat = flatten_conversation_bundles(bundle)

# The true_rate heuristic applies only when the more_refuse mode is selected
# and the flattened frame actually carries that column.
use_true_rate = SOURCE_MODE == "more_refuse" and "true_rate" in flat.columns
if use_true_rate:
    # Missing true_rate counts as 0, i.e. the turn is flagged as a refusal.
    true_rate = flat["true_rate"].fillna(0)
    flat["is_refusal"] = true_rate < REFUSAL_TRUE_RATE_THRESHOLD
    flagged = flat["is_refusal"].sum()
    print(
        f"refusal derived from true_rate<{REFUSAL_TRUE_RATE_THRESHOLD} "
        f"({flagged} flagged)"
    )
else:
    # A falsy imm_1_check marks the turn as refused.
    check_passed = flat["imm_1_check"].astype(bool)
    flat["is_refusal"] = ~check_passed
    flagged = flat["is_refusal"].sum()
    print(f"refusal derived from imm_1_check ({flagged} flagged)")

print("rows", len(flat))
print("columns", flat.columns.tolist())
flat.head()



### Perspective score extraction
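Normalize Perspective output into per-attribute `persp_*` columns. If a row carries a complete `perspective_ls` vector (span end first, then one score per attribute in `PERSPECTIVE_ATTRIBUTES` order), it is expanded directly; otherwise the summary scores are pulled from the raw Perspective dict in `perspective`.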


In [None]:

# Attribute names whose scores are extracted; the order also defines the
# column order of the resulting ``persp_*`` score dictionaries.
PERSPECTIVE_ATTRIBUTES = [
    "AFFINITY_EXPERIMENTAL",
    "COMPASSION_EXPERIMENTAL",
    "CURIOSITY_EXPERIMENTAL",
    "IDENTITY_ATTACK",
    "IDENTITY_ATTACK_EXPERIMENTAL",
    "INSULT",
    "INSULT_EXPERIMENTAL",
    "NUANCE_EXPERIMENTAL",
    "PERSONAL_STORY_EXPERIMENTAL",
    "PROFANITY",
    "PROFANITY_EXPERIMENTAL",
    "REASONING_EXPERIMENTAL",
    "RESPECT_EXPERIMENTAL",
    "SEVERE_TOXICITY",
    "SEVERE_TOXICITY_EXPERIMENTAL",
    "SEXUALLY_EXPLICIT",
    "THREAT",
    "THREAT_EXPERIMENTAL",
    "TOXICITY",
    "TOXICITY_EXPERIMENTAL",
]


def _as_dict(value) -> dict:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def extract_summary_scores(
    entry: Optional[dict],
    attributes: Sequence[str] = PERSPECTIVE_ATTRIBUTES,
) -> Dict[str, Optional[float]]:
    """Flatten one raw Perspective entry into ``persp_*`` score columns.

    Args:
        entry: Mapping of attribute name to its payload; the value read for
            each attribute is ``payload["summaryScore"]["value"]``. Anything
            that is not a dict (e.g. ``None``) is treated as "no scores".
        attributes: Attribute names to extract, in output order.

    Returns:
        A dict with one ``persp_<attribute lowercased>`` key per attribute
        (``None`` where the score is absent) followed by ``persp_span_end``,
        the ``end`` of the first ``spanScores`` item of the first attribute
        (``None`` when unavailable).

    Missing keys, ``None`` values and otherwise malformed nested payloads are
    all treated as absent instead of raising, and an empty ``attributes``
    sequence yields only ``persp_span_end``.
    """
    payload = _as_dict(entry)

    scores: Dict[str, Optional[float]] = {}
    for attr in attributes:
        # ``dict.get(key, {})`` only guards against a missing key; an explicit
        # ``None`` stored under the key must be normalised as well.
        summary = _as_dict(_as_dict(payload.get(attr)).get("summaryScore"))
        scores[f"persp_{attr.lower()}"] = summary.get("value")

    span_end = None
    if len(attributes) > 0:
        # The span end is read from the first requested attribute only.
        spans = _as_dict(payload.get(attributes[0])).get("spanScores")
        if isinstance(spans, (list, tuple)) and spans and isinstance(spans[0], dict):
            span_end = spans[0].get("end")
    scores["persp_span_end"] = span_end
    return scores


def perspective_row_to_dict(row: pd.Series) -> Dict[str, Optional[float]]:
    """Return the ``persp_*`` score dict for one flattened row.

    When ``row["perspective_ls"]`` is a list/tuple holding at least one value
    per attribute plus a leading span end, that vector is expanded directly.
    Otherwise the scores are extracted from the raw ``row["perspective"]``
    entry via ``extract_summary_scores``.
    """
    vector = row.get("perspective_ls")
    required_length = len(PERSPECTIVE_ATTRIBUTES) + 1
    has_full_vector = isinstance(vector, (list, tuple)) and len(vector) >= required_length
    if not has_full_vector:
        return extract_summary_scores(row.get("perspective"))

    # Layout: position 0 is the span end, the attribute scores follow in
    # PERSPECTIVE_ATTRIBUTES order; any surplus trailing values are ignored.
    span_end, attribute_values = vector[0], vector[1:]
    scores = {}
    for attr, val in zip(PERSPECTIVE_ATTRIBUTES, attribute_values):
        scores[f"persp_{attr.lower()}"] = val
    scores["persp_span_end"] = span_end
    return scores


# Reset to a 0..n-1 index so the score frame (built with a default index)
# lines up row-for-row when concatenated column-wise.
flat_reset = flat.reset_index(drop=True)

# One score dict per row, then one column per score key.
score_records = flat_reset.apply(perspective_row_to_dict, axis=1).tolist()
score_frame = pd.DataFrame(score_records)

analysis_df = pd.concat([flat_reset, score_frame], axis=1)
score_columns = [name for name in analysis_df.columns if name.startswith("persp_")]
print("score columns", score_columns[:5], "...", len(score_columns))
analysis_df.head()



### Refusal overview
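Basic refusal rates at the utterance and conversation levels. `refusal_convo_share` is the fraction of conversations whose per-conversation refusal rate exceeds `REFUSAL_CONVERSATION_THRESHOLD`.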


In [None]:
# Utterance level: share of all turns flagged as refusals.
refusal_flags = analysis_df["is_refusal"]
refusal_rate = refusal_flags.mean()

# Conversation level: refusal share within each conversation, then the
# fraction of conversations whose share exceeds the bucketing threshold.
convo_refusal = refusal_flags.groupby(analysis_df["conversation_idx"]).mean()
refusal_convo_share = convo_refusal.gt(REFUSAL_CONVERSATION_THRESHOLD).mean()

print({"refusal_rate": refusal_rate, "refusal_convo_share": refusal_convo_share})
convo_refusal.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])


### Perspective attribute comparison
Contrast mean Perspective scores between refused and accepted turns.


In [None]:

# Per-attribute score columns; the span end is a character offset, not a score.
score_cols = [c for c in analysis_df.columns if c.startswith("persp_") and c != "persp_span_end"]

# Coerce to numeric first: an attribute that is absent from every row arrives
# as an all-None object column, which groupby().mean() either rejects or drops
# depending on the pandas version. After coercion it is simply all-NaN.
numeric_scores = analysis_df[score_cols].apply(pd.to_numeric, errors="coerce")

# Reindex to both classes so the accept/refuse columns always exist, even when
# the data contains only refusals or only acceptances (the missing side is NaN
# instead of a KeyError on the delta below).
means = (
    numeric_scores.groupby(analysis_df["is_refusal"])
    .mean()
    .reindex([False, True])
    .T
)
means = means.rename(columns={False: "accept_mean", True: "refuse_mean"})
means["delta_refuse_minus_accept"] = means["refuse_mean"] - means["accept_mean"]

# Attributes that are most elevated on refused turns come first.
means.sort_values("delta_refuse_minus_accept", ascending=False).head(10)



### Toxicity thresholds and examples
Flag high-toxicity turns and surface examples for manual inspection.


In [None]:

# Turns without a TOXICITY score are treated as 0, i.e. non-toxic.
toxicity = analysis_df["persp_toxicity"]
analysis_df["is_toxic"] = toxicity.fillna(0).ge(HIGH_TOXICITY)

# Row-normalised: within refused / accepted turns, the share that is toxic.
ct = pd.crosstab(analysis_df["is_refusal"], analysis_df["is_toxic"], normalize="index")
print(ct)
print(toxicity.describe())

# Top-5 most toxic turns on each side of the refusal flag, for manual review.
example_columns = ["conversation_idx", "text", "imm_1", "persp_toxicity"]
refusal_mask = analysis_df["is_refusal"]
refused_examples = analysis_df.loc[refusal_mask].nlargest(5, "persp_toxicity")[example_columns]
accepted_examples = analysis_df.loc[~refusal_mask].nlargest(5, "persp_toxicity")[example_columns]
refused_examples, accepted_examples



### Optional export of flattened scores
Uncomment to materialize a flat Perspective score table for downstream notebooks.


In [None]:

# Writes the full analysis table (flattened rows plus the persp_* score columns)
# to FLAT_SCORES_PATH and then displays that path.
# NOTE(review): DataFrame.to_parquet requires a parquet engine (pyarrow or
# fastparquet); object columns holding raw Perspective dicts/lists may need to
# be dropped first — confirm before enabling.
# analysis_df.to_parquet(FLAT_SCORES_PATH, index=False)
# FLAT_SCORES_PATH
