In [None]:
import feedback_forensics as ff
import feedback_forensics.app.plotting.paper as paper_plot
import pathlib
from IPython.display import display, Latex

# Input data and output locations (relative to the notebook's working directory).
data_path = pathlib.Path("../feedback-forensics-results")
fig_save_path = pathlib.Path("./output/png")
tex_save_path = pathlib.Path("./output/tex")
tex_app_save_path = pathlib.Path("./output/tex/appendix")

# Ensure every output directory exists before anything is written to it.
fig_save_path.mkdir(parents=True, exist_ok=True)
tex_save_path.mkdir(parents=True, exist_ok=True)
tex_app_save_path.mkdir(parents=True, exist_ok=True)


# Save the general LaTeX preamble.
# The encoding is explicit so the written bytes do not depend on the platform
# default; the other .tex writers in this notebook also use UTF-8.
with open(tex_save_path / "000_preamble.tex", "w", encoding="utf-8") as f:
    f.write(paper_plot.get_latex_doc_preamble())

# Example LaTeX table: built only from the table preamble and postamble helpers.
# The lines are assembled before the file is opened so that a failure in the
# helpers does not leave an empty file behind.
latex = []
latex = paper_plot.add_table_preamble(latex, "Example Table")
latex = paper_plot.add_table_postamble(latex)
with open(tex_save_path / "999_example_table.tex", "w", encoding="utf-8") as f:
    f.write("\n".join(latex))

# Core preference datasets, each stored as a [results path, display name] pair.
main_pref_datasets = [
    [data_path / file_name, display_name]
    for file_name, display_name in (
        ("allenai_multipref.json", "MultiPref"),
        ("chatbot_arena.json", "Chatbot Arena"),
        ("prism.json", "PRISM"),
    )
]

# Everything that gets a top/bottom table: the Llama 4 comparison first,
# followed by the core preference datasets (same pair objects as above).
datasets = [
    [
        pathlib.Path("../forensics-data/feedback-forensics-public-results/llama4_arena_vs_public_version.json"),
        "Llama 4 Arena vs Public",
    ],
] + main_pref_datasets

# Show one resolved results path as a quick sanity check.
print(data_path / "allenai_multipref.json")

# Cache dict handed to every DatasetHandler created in this notebook.
cache = {}

# Metric used to rank annotators in the top/bottom tables (same for all datasets).
metric_name = "strength"

for dataset_path, dataset_name in datasets:
    dataset = ff.DatasetHandler(cache=cache)
    dataset.add_data_from_path(dataset_path)

    # NOTE(review): overall_metrics is not used below; the call is kept in case
    # it is relied on for interactive inspection or has needed side effects.
    overall_metrics = dataset.get_overall_metrics()
    annotator_metrics = dataset.get_annotator_metrics()

    # The annotator metrics are looked up by the file name up to its first ".".
    dataset_key = dataset_path.name.split(".")[0]
    strength_metrics = annotator_metrics[dataset_key]["metrics"][metric_name]

    # The Llama 4 comparison contrasts two model versions, so it gets its own
    # titles; all other datasets use the generic "encouraged traits" titles.
    if dataset_name == "Llama 4 Arena vs Public":
        kwargs = {
            "top_title": "Traits stronger in arena relative to public model",
            "bottom_title": "Traits weaker in arena relative to public model",
        }
    else:
        kwargs = {
            "top_title": "Ten most encouraged personality traits",
            "bottom_title": "Ten least encouraged personality traits",
        }

    # Main-text table (uses the helper's default top/bottom sizes).
    latex_table = paper_plot.get_latex_top_and_bottom_annotators(
        annotator_metrics=strength_metrics,
        metric_name=metric_name.capitalize(),
        **kwargs,
    )
    # Explicit UTF-8 so the output does not depend on the platform default
    # encoding; the other .tex writers in this notebook do the same.
    with open(tex_save_path / f"001_top_and_bottom_annotators_{dataset_key}.tex", "w", encoding="utf-8") as f:
        f.write(latex_table)

    # Appendix table: same content with explicit top_n/bottom_n of 10,
    # written under the appendix directory with the same file name.
    latex_table_app = paper_plot.get_latex_top_and_bottom_annotators(
        annotator_metrics=strength_metrics,
        metric_name=metric_name.capitalize(),
        top_n=10,
        bottom_n=10,
        **kwargs,
    )
    with open(tex_app_save_path / f"001_top_and_bottom_annotators_{dataset_key}.tex", "w", encoding="utf-8") as f:
        f.write(latex_table_app)





In [None]:
# Comparison of preference datasets relative to each other

combined_dataset = ff.DatasetHandler(cache=cache)
combined_dataset.load_data_from_paths([str(path_and_name[0]) for path_and_name in main_pref_datasets])

# Column labels used in the tables.
# NOTE(review): rename() silently ignores keys that are not present; confirm the
# frame's columns really carry the ".json" suffix, otherwise nothing is renamed.
pretty_names = {
    "allenai_multipref.json": "MultiPref",
    "chatbot_arena.json": "Chatbot Arena",
    "prism.json": "PRISM",
}

for i, metric in enumerate(["strength", "relevance", "cohens_kappa_randomized"]):
    # The metrics frame does not depend on the table length, so it is built
    # (and renamed) once per metric instead of once per output file.
    metrics_df = combined_dataset.get_annotator_metrics_df(
        metric_name=metric,
        index_col_name="Generate a response that...",
    ).rename(columns=pretty_names)

    # One appendix table per length, each showing the first `length` rows.
    for length in [5, 20, 40]:
        latex_str = paper_plot.get_latex_table_from_metrics_df(
            metrics_df=metrics_df.head(length),
            title=f"Differences between datasets (metric: {metric})",
            first_col_width=0.3,
        )

        with open(tex_app_save_path / f"00{i}_cross_datasets_{metric}_top{length}.tex", "w", encoding="utf-8") as f:
            f.write(latex_str)

In [None]:
# Analysis of Arena data
import pandas as pd

# Load the Chatbot Arena results into a fresh handler that shares the notebook cache.
dataset_name = "chatbot_arena.json"
dataset = ff.DatasetHandler(cache=cache)
dataset.add_data_from_path(data_path / dataset_name)

# Underlying frame of the first handler; not used further in this cell
# (presumably kept for interactive inspection).
general_df = dataset.first_handler.df
# Values of the "narrower_category" column to compare (three writing domains).
values = [
    'Songwriting Prompts',
    'Resume and Cover Letter Writing',
    'Professional Email Communication',
]
# Split the data on "narrower_category", restricted to the selected values.
# NOTE(review): presumably this yields one metrics column per selected value — confirm.
dataset.split_by_col(col="narrower_category", selected_vals=values)

metrics_df = dataset.get_annotator_metrics_df(metric_name="strength", index_col_name="Generate a response that...")

# Only the first five rows of the metrics frame go into the main-text table.
latex_str = paper_plot.get_latex_table_from_metrics_df(
    metrics_df=metrics_df.head(5),
    title="Encouraged personality traits across writing domains in Chatbot Arena (Strength)",
)

with open(tex_save_path / "002_writing_tasks_arena.tex", "w", encoding="utf-8") as f:
    f.write(latex_str)

In [None]:
# Analysis of MultiPref data
import pandas as pd
import feedback_forensics as ff
import pathlib

# Rebinds the notebook-level `cache` to a new empty dict: handlers created after
# this point with cache=cache no longer share the dict used by earlier cells.
cache = {}
# Same results directory as defined at the top of the notebook.
data_path = pathlib.Path("../feedback-forensics-results")

# Load the MultiPref results; `dataset` is reused by the following cells.
dataset_name = "allenai_multipref.json"
dataset = ff.DatasetHandler(cache=cache)
dataset.add_data_from_path(data_path / dataset_name)

In [None]:
# Select the MultiPref annotators to compare: variant "unknown" or "human" whose
# visible name mentions a regular ("normal") rater, an expert rater, or GPT-4.
annotator_metadata = dataset.get_available_annotators()
special_name_markers = ("normal", "expert", "gpt4")
special_annotators = {
    annotator_key: metadata
    for annotator_key, metadata in annotator_metadata.items()
    if metadata["variant"] in ["unknown", "human"]
    and any(marker in metadata["annotator_visible_name"] for marker in special_name_markers)
}

# Restrict the handler to the selected annotators and compute their strength metrics.
# `df` is consumed by the next cell, which renames and reorders its columns.
dataset.set_annotator_cols(annotator_keys=list(special_annotators.keys()))
df = dataset.get_annotator_metrics_df(metric_name="strength", index_col_name="Generate a response that...")


In [6]:
# Map the raw annotator column labels to the names shown in the paper.
# The raw labels are 0-indexed, the display names 1-indexed.
rename_dict = {
    'allenai_multipref\n(unknown: expert_0_preferred_text)': 'Human Expert 1',
    'allenai_multipref\n(unknown: expert_1_preferred_text)': 'Human Expert 2',
    'allenai_multipref\n(unknown: normal_0_preferred_text)': 'Human Regular 1',
    'allenai_multipref\n(unknown: normal_1_preferred_text)': 'Human Regular 2',
    'allenai_multipref\n(unknown: preferred_text_gpt4)': 'GPT-4-Turbo',
}
# Rename the columns without mutating the frame produced by the previous cell
# in place, so this cell stays safe to re-run.
df = df.rename(columns=rename_dict)

# Reorder the columns (experts, regular, GPT-4). This raises a KeyError if any
# of the renames above did not match, which surfaces label changes early.
df = df[['Generate a response that...', 'Human Expert 1', 'Human Expert 2', 'Human Regular 1', 'Human Regular 2', 'GPT-4-Turbo', 'Max diff']]

# Only the first five rows go into the main-text table.
latex_str = paper_plot.get_latex_table_from_metrics_df(
    metrics_df=df.head(5),
    title="Personality traits encouraged by different annotators on MultiPref (Strength)",
    first_col_width=0.15,
)

with open(tex_save_path / "003_cross_annotator_comparison_multipref.tex", "w", encoding="utf-8") as f:
    f.write(latex_str)



In [None]:
# plotting model analysis
import pandas as pd
import feedback_forensics as ff
import pathlib

results_path = pathlib.Path("exp/outputs/2025-05-15_13-46-39_mc_v2/results/070_annotations_train_ap.json")

dataset = ff.DatasetHandler(cache=cache)
dataset.add_data_from_path(results_path)

# Keep only the annotators whose variant is "model_identity".
annotators = dataset.get_available_annotators()
model_anns = {
    k: v for k, v in annotators.items() if v["variant"] == "model_identity"
}

dataset.set_annotator_cols(annotator_keys=list(model_anns.keys()))
metrics_df = dataset.get_annotator_metrics_df(metric_name="strength", index_col_name="Generate a response that...")

# Strip the "070_annotations_train_ap\n(Model: ...)" wrapper and the "openrouter/"
# prefix from each column name, then replace spaces with dashes.
# regex=False is explicit: before pandas 2.0 multi-character patterns were
# treated as regular expressions by default, and the unbalanced "(" in the
# first pattern is not a valid regex.
metrics_df.columns = (
    metrics_df.columns
    .str.replace("070_annotations_train_ap\n(Model: ", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("openrouter/", "", regex=False)
    .str.replace(" ", "-", regex=False)
)

# The space-to-dash replacement above also hit "Max diff"; restore it.
metrics_df = metrics_df.rename(columns={"Max-diff": "Max diff"})

# Set the GPT-4o column to 0 (the per-provider tables later in this notebook
# describe it as the reference model). The guard avoids silently appending a
# new all-zero column if the model is absent, and 0.0 keeps the value a float
# like in those per-provider tables.
# NOTE(review): "Max diff" is not recomputed after this — confirm that is intended.
reference_model_col = "openai/gpt-4o-2024-08-06"
if reference_model_col in metrics_df.columns:
    metrics_df[reference_model_col] = 0.0

# Only the first five rows go into the main-text table.
latex_str = paper_plot.get_latex_table_from_metrics_df(
    metrics_df=metrics_df.head(5),
    title="Most diverging personality traits across models",
    first_col_width=0.15,
)

with open(tex_save_path / "004_model_comparison.tex", "w", encoding="utf-8") as f:
    f.write(latex_str)

In [None]:
# Inspection only: show the cleaned column names of the model comparison frame.
list(metrics_df.head(5).columns)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Short display labels for the model identifiers; the plotting function below
# looks up each model column name here to label its data point.
short_names = {
    'meta-llama/llama-3-70b-instruct': "Llama-3-70b",
    'meta-llama/llama-3.3-70b-instruct':  "Llama-3.3-70b",
    'meta-llama/llama-4-maverick': "Llama-4-Maverick",
    'mistralai/mistral-medium': "Mistral-Medium",
    'mistralai/mistral-medium-3': "Mistral-Medium-3",
    'openai/gpt-4.1-2025-04-14': "GPT-4.1",
    'openai/gpt-4o-2024-08-06': "GPT-4o",
    'meta-llama/llama-2-70b-chat': "Llama-2-70b",
    'openai/gpt-3.5-turbo': "GPT-3.5-Turbo",
    'mistralai/mistral-7b-instruct-v0.1': "Mistral-7b",
}


def plot_model_comparison_by_family(metrics_df, trait_index="makes more confident statements", ax=None, top_margin=0.25, bottom_margin=0.25):
    """
    Plot one trait's metric value for every model, grouped by model family.

    Each family (Meta, Mistral, OpenAI) is drawn as its own connected line with
    a distinct colour and marker, and every point is labelled with the model's
    short name. Families without any matching column are skipped.

    Parameters:
    -----------
    metrics_df : DataFrame
        Metrics frame; the row labelled `trait_index` is plotted. It must
        contain a "Max diff" column, which is excluded from the plot.
    trait_index : str
        Row label of the trait to plot; also used (capitalised) as the title.
    ax : matplotlib Axes, optional
        Axes to draw into. A new 3x2 inch figure is created when omitted.
    top_margin, bottom_margin : float
        Extra vertical space above/below the data, as a fraction of the
        automatically chosen y-range (leaves room for the point labels).

    Returns:
    --------
    fig : Figure
        The figure that owns `ax`.
    """
    # Imported lazily so the rest of the notebook does not require adjustText
    from adjustText import adjust_text

    # Create axis if not provided
    if ax is None:
        fig, ax = plt.subplots(figsize=(3, 2))
    else:
        fig = ax.figure

    # One value per model column for the requested trait
    data = metrics_df.loc[trait_index].drop("Max diff")

    # Group model columns by family via substring match on the column name.
    # OpenAI models are ordered: everything else first, then gpt-4o, then gpt-4.1.
    candidate_families = {
        "Meta": sorted(col for col in data.index if "llama" in col.lower()),
        "Mistral": sorted(col for col in data.index if "mistral" in col.lower()),
        "OpenAI": sorted(
            (col for col in data.index if "gpt" in col.lower()),
            key=lambda x: 0 if "gpt-4o" in x.lower() else 1 if "gpt-4.1" in x.lower() else -1,
        ),
    }
    # Drop empty families: plotting one x position against zero values would
    # fail with a length mismatch, and an empty group would only leave a gap.
    model_families = {family: models for family, models in candidate_families.items() if models}

    # Colour and marker are tied to the family name, so a family keeps its
    # appearance even when another family is absent from the data.
    family_styles = {
        "Meta": ("#9eb0ff", "o"),     # light blue
        "Mistral": ("#ffadad", "s"),  # light red
        "OpenAI": ("#84cb75", "^"),   # light green
    }

    texts = []         # label artists, repositioned by adjustText below
    line_objects = []  # family lines the labels should avoid
    family_centers = []

    start_x = 0
    for family, models in model_families.items():
        # Each family occupies one x unit per model, followed by a one-unit gap
        width = len(models)
        center = start_x + width / 2
        family_centers.append(center)
        start_x += width + 1

        # Models are spread evenly inside the family's slot
        x = np.linspace(center - width / 2 + 0.5, center + width / 2 - 0.5, width)
        family_data = data[models]

        color, marker = family_styles[family]
        line, = ax.plot(x, family_data.values, marker=marker, linestyle='-',
                        color=color, linewidth=2, markersize=8)
        line_objects.append(line)

        # Add model names as labels; fall back to the raw column name when no
        # short name is registered instead of failing with a KeyError
        for x_pos, y_val, model in zip(x, family_data.values, models):
            texts.append(ax.text(x_pos, y_val, short_names.get(model, model),
                                 ha='center', va='center', fontsize=8))

    # Add vertical padding to the plot
    y_min, y_max = ax.get_ylim()
    y_range = y_max - y_min
    ax.set_ylim(y_min - bottom_margin * y_range, y_max + top_margin * y_range)

    # Add horizontal padding to the plot (15% of the x-range on each side)
    x_min, x_max = ax.get_xlim()
    padding = (x_max - x_min) * 0.15
    ax.set_xlim(x_min - padding, x_max + padding)

    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.7)
    ax.grid(True, linestyle=':', alpha=0.7, axis='y')
    ax.set_ylabel('Strength of trait')

    # Set x-ticks at the center of each family group with family names;
    # these labels replace a legend
    ax.set_xticks(family_centers)
    ax.set_xticklabels(list(model_families))
    ax.tick_params(axis='x', which='both', length=0)  # Make tick marks invisible but keep labels

    ax.set_yticks([], minor=True)
    ax.set_title(trait_index.capitalize())

    ax.tick_params(right=False)

    # Use adjustText to automatically position labels without overlap
    # (skipped when nothing was plotted)
    if texts:
        adjust_text(texts,
                    objects=line_objects,
                    arrowprops=dict(arrowstyle='-', color='gray', lw=0.5),
                    ax=ax,
                    force_text=(0.5, 0.4),
                    time_lim=1,
        )

    # Lay out the figure that owns `ax`, which need not be pyplot's current figure
    fig.tight_layout()
    return fig

def plot_multiple_traits_side_by_side(metrics_df, traits, ncols=2, top_margin=0.25, bottom_margin=0.25):
    """
    Plot multiple trait comparisons side by side

    Parameters:
    -----------
    metrics_df : DataFrame
        The metrics dataframe
    traits : list
        List of traits to plot (must not be empty)
    ncols : int
        Number of columns in the grid layout
    top_margin, bottom_margin : float
        Vertical padding passed through to plot_model_comparison_by_family
        for every subplot

    Returns:
    --------
    fig : Figure
        The matplotlib figure

    Raises:
    -------
    ValueError
        If `traits` is empty (a grid with zero rows cannot be created)
    """
    if len(traits) == 0:
        raise ValueError("traits must contain at least one trait to plot")

    nrows = (len(traits) + ncols - 1) // ncols  # Ceiling division: rows needed
    # squeeze=False always returns a 2D array of axes, even for a 1x1 grid
    fig, axes = plt.subplots(nrows, ncols, figsize=(2.5 * ncols, 2 * nrows), squeeze=False)
    axes = axes.flatten()

    # Plot each trait into its own subplot (the grid always has enough cells)
    for i, trait in enumerate(traits):
        trait_ax = axes[i]
        plot_model_comparison_by_family(metrics_df, trait, ax=trait_ax, top_margin=top_margin, bottom_margin=bottom_margin)
        # Keep the y label only on the first plot of each row
        if i % ncols != 0:
            trait_ax.set_ylabel('')

    # Hide any unused subplots
    for unused_ax in axes[len(traits):]:
        unused_ax.set_visible(False)

    # Lay out this figure explicitly rather than pyplot's current figure
    fig.tight_layout()
    return fig

# Traits to plot, in the order used for pairing them into double figures.
# (A shorter five-trait list that used to precede this one was immediately
# overwritten and has been dropped; all of its entries are included here.)
traits = [
    'has more structured formatting',
    'is more verbose',
    'makes more confident statements',
    "has a friendlier tone",
    'more strictly follows the requested output format',
    'is more factually correct',
    'is more concise',
    'contains less harmful information',
    'provides more examples',
    'provides a numbered list format',
    'is more polite',
    'uses more formal language',
    'uses more personal pronouns (I, we, you)',
    'ends with a follow-up question',
    'uses more bold and italics text',
    'actively engages the reader with rhetorical questions',
    'is more creative and original',
    'refuses to answer the question',
    "compliments the user's question or prompt",
    'is more empathetic to the user',
    'expresses more emotion',
    'agrees more with the user',
    'provides conclusions without full reasoning',
    'uses a more enthusiastic tone',
    'includes more references to other sources',
    'is more optimistic',
    'more actively engages with the user',
    'includes more ethical considerations',
    'uses more casual language',
    'uses more mathematical symbols and notation',
    'uses more humour',
    'acknowledges own limitations or uncertainty more',
    'uses more emojis',
    'has a more avoidant tone',
    'includes inappropriate language',
    'is more offensive',
    'suggests illegal activities',
    "reinforces user's beliefs more",
    "reinforces user's anger more",
    'agrees with user even if factually incorrect',
]

comparison_path = fig_save_path / "model_comparison"
comparison_path.mkdir(parents=True, exist_ok=True)

# Vertical-margin variants rendered for every pair of traits:
# (file name suffix, top_margin, bottom_margin). The variants differ only in
# where the extra space for the point labels is placed.
margin_variants = [
    ("topheavy", 0.45, 0.15),
    ("megatopheavy", 0.55, 0.15),
    ("bottomheavy", 0.15, 0.45),
    ("centered", 0.20, 0.20),
]

# Create double figures with two traits side by side.
# Stopping at len(traits) - 1 skips an unpaired trailing trait.
for i in range(0, len(traits) - 1, 2):
    trait_group = [traits[i], traits[i + 1]]

    for variant_name, variant_top, variant_bottom in margin_variants:
        fig = plot_multiple_traits_side_by_side(
            metrics_df,
            trait_group,
            ncols=2,
            top_margin=variant_top,
            bottom_margin=variant_bottom,
        )
        fig.savefig(comparison_path / f"double_figure_{i//2}_{variant_name}.png", dpi=300)
        # Close each figure once saved: otherwise every figure of every pair
        # stays open in pyplot until the kernel is restarted. The figures are
        # written to disk and no longer rendered inline.
        plt.close(fig)

In [None]:
# Inspection only: show the row labels of the model comparison frame.
metrics_df.index


In [None]:
# Get the standard personality prompts: display the "v4" entry of the
# default principles shipped with inverse_cai (inspection only).

import inverse_cai.experiment.config.default_principles as dp

dp.DEFAULT_PRINCIPLES["v4"]

In [None]:
# plotting model analysis
import pandas as pd
import feedback_forensics as ff
import pathlib

# Same results file as in the main-text model comparison cell.
results_path = pathlib.Path("exp/outputs/2025-05-15_13-46-39_mc_v2/results/070_annotations_train_ap.json")

# This handler is only used to list the available annotators; the loop below
# creates a fresh handler for each provider.
dataset = ff.DatasetHandler(cache=cache)
dataset.add_data_from_path(results_path)

# Keep only the annotators whose variant is "model_identity".
annotators = dataset.get_available_annotators()
model_anns = {
    k: v for k, v in annotators.items() if v["variant"] == "model_identity"
}

# Column-name mapping for the per-provider tables: model identifiers get short
# labels, while "Max diff" and the trait column (whose spaces were replaced by
# dashes during column cleaning) are mapped to their display names.
# Note: this rebinds the `short_names` defined in the plotting cell.
short_names = {
    'meta-llama/llama-3-70b-instruct': "Llama-3-70b",
    'meta-llama/llama-3.3-70b-instruct':  "Llama-3.3-70b",
    'meta-llama/llama-4-maverick': "Llama-4-Maverick",
    'mistralai/mistral-medium': "Mistral-Medium",
    'mistralai/mistral-medium-3': "Mistral-Medium-3",
    'openai/gpt-4.1-2025-04-14': "GPT-4.1",
    'openai/gpt-4o-2024-08-06': "GPT-4o",
    'meta-llama/llama-2-70b-chat': "Llama-2-70b",
    'openai/gpt-3.5-turbo': "GPT-3.5-Turbo",
    'mistralai/mistral-7b-instruct-v0.1': "Mistral-7b",
    'Max diff': "Max diff",
    "Generate-a-response-that...": "Generate a response that...",
}


# One appendix table per model provider, restricted to that provider's models.
for provider in ["Meta", "Mistral", "OpenAI"]:

    # Annotators whose model id contains the provider name (case-insensitive).
    prov_model_anns = {key: ann for key, ann in model_anns.items() if provider.lower() in ann["model_id"].lower()}

    # Fresh handler per provider, restricted to this provider's annotators.
    dataset = ff.DatasetHandler(cache=cache)
    dataset.add_data_from_path(results_path)

    dataset.set_annotator_cols(annotator_keys=list(prov_model_anns.keys()))
    metrics_df = dataset.get_annotator_metrics_df(metric_name="strength", index_col_name="Generate a response that...")

    # Strip the "070_annotations_train_ap\n(Model: ...)" wrapper and the
    # "openrouter/" prefix from each column name, then replace spaces with dashes.
    # regex=False is explicit: before pandas 2.0 multi-character patterns were
    # treated as regular expressions by default, and the unbalanced "(" in the
    # first pattern is not a valid regex.
    metrics_df.columns = (
        metrics_df.columns
        .str.replace("070_annotations_train_ap\n(Model: ", "", regex=False)
        .str.replace(")", "", regex=False)
        .str.replace("openrouter/", "", regex=False)
        .str.replace(" ", "-", regex=False)
    )
    metrics_df = metrics_df.rename(columns={"Max-diff": "Max diff"})

    print(metrics_df.columns)

    # Translate to display names. Columns without an entry keep their cleaned
    # name; Index.map(dict) would turn them into NaN labels, which cannot be
    # sorted together with strings below.
    metrics_df.columns = [short_names.get(col, col) for col in metrics_df.columns]

    if "GPT-4o" in metrics_df.columns:
        # since this is the reference model, set it to 0 - otherwise different frame of reference
        # (assigned before the column selection below, so the write does not
        # happen on a sliced frame)
        # NOTE(review): "Max diff" is not recomputed after this — confirm that is intended.
        metrics_df["GPT-4o"] = 0.0

    # Column order: trait column first, model columns alphabetically, "Max diff" last.
    trait_col = "Generate a response that..."
    model_cols = sorted(col for col in metrics_df.columns if col != trait_col and col != "Max diff")
    metrics_df = metrics_df[[trait_col, *model_cols, "Max diff"]]

    print(metrics_df.columns)

    # The appendix table shows the first 40 rows.
    latex_str = paper_plot.get_latex_table_from_metrics_df(
        metrics_df=metrics_df.head(40),
        title=f"Personality traits across {provider} models",
        first_col_width=0.3,
    )

    with open(tex_app_save_path / f"010_model_comparison_{provider}.tex", "w", encoding="utf-8") as f:
        f.write(latex_str)