In [1]:
%load_ext autoreload
%autoreload 2 

from pathlib import Path
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydantic
import seaborn as sns
from buttermilk import BM
from cmap import Colormap
from rich import print as rprint

from buttermilk import BM
import os

from hydra import initialize, compose
from omegaconf import OmegaConf

# Compose the Hydra configuration for this job. The overrides add the data,
# step and save config groups and set the job name.
with initialize(version_base=None, config_path="./conf"):
    cfg = compose(
        config_name="config",
        overrides=[
            "+data=[judger,drag]",
            "+step=ordinary",
            "+save=bq",
            "job=results_oneshot",
        ],
    )

# Project helper object, built from the composed config and used by later cells.
bm = BM(cfg=cfg)

# Figure defaults. The font size is applied last, after the seaborn
# context/style calls, matching the original statement order.
plt.rcParams.update({"figure.dpi": 300, "figure.figsize": (10, 8)})
sns.set_context("notebook")
sns.set_style("darkgrid")
plt.rcParams["font.size"] = 14

# Pretty-print the fully resolved data section of the config.
rprint(OmegaConf.to_container(bm.cfg.data, resolve=True))



  from .autonotebook import tqdm as notebook_tqdm


[32m2024-11-05 09:16:06[0m [35mJ5HW6L4KT6[0m [34mbuttermilk[0m buttermilk.py[ 297] [1;30mINFO[0m {'message': "Logging setup for: 20241104T2316Z-YiS9-J5HW6L4KT6-suzor. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'automod', 'job': 'results_oneshot', 'task_id': '20241104T2316Z-YiS9-J5HW6L4KT6-suzor'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/automod/results_oneshot/20241104T2316Z-YiS9-J5HW6L4KT6-suzor", 'project': 'automod', 'job': 'results_oneshot', 'run_id': '20241104T2316Z-YiS9-J5HW6L4KT6-suzor', 'save_dir': 'gs://dmrc-analysis/runs/automod/results_oneshot/20241104T2316Z-YiS9-J5HW6L4KT6-suzor', 'ip': '159.196.210.27', 'node_name': 'J5HW6L4KT6', 'username': 'suzor'}
[32m2024-11-05 09:16:06[0m [35mJ5HW6L4KT6[0m [34mbuttermilk[0m buttermilk.py[ 305] [1;30mDEBUG[0m [32mButtermilk version is: 0.2.0[0m
Prompt flow ser

In [2]:
from buttermilk.runner.helpers import load_data, prepare_step_data

# Build `df` from the configured data section and order it newest-first by
# `timestamp` (presumably a pandas DataFrame, given the `.sort_values` call).
# NOTE(review): the saved output of this cell ends in
#   KeyError: "['content'] not in index"
# so the cell does not currently run to completion. The cause is not visible
# here; presumably a configured source lacks a `content` column — confirm.
# NOTE(review): `load_data` is imported but not used in this cell.
df = prepare_step_data(bm.cfg.data).sort_values(by=["timestamp"], ascending=False)
df


[32m2024-11-05 09:16:18[0m [35mJ5HW6L4KT6[0m [34mbuttermilk[0m buttermilk.py[ 376] [1;30mINFO[0m Query stats: Ran in 0:00:03.110136 seconds, cache hit: False, billed 10.49 MB, approx cost $5.2e-06.


INFO:buttermilk:Query stats: Ran in 0:00:03.110136 seconds, cache hit: False, billed 10.49 MB, approx cost $5.2e-06.


KeyError: "['content'] not in index"

In [None]:
# Load every row from the configured destination table.
sql = f"""SELECT * FROM `{cfg.data.destination}`"""
df = bm.run_query(sql)

# Explode the list-like `source` column so that every (row, source) pair
# becomes its own row, then join it back on the original row index.
source = df.source.explode()
df = df.drop(columns='source').join(source)

# Rows without a prediction cannot be scored.
df = df.dropna(subset='prediction')

# Normalise record ids: lower-case and strip every non-word character.
# (The original cell ran this identical, idempotent statement twice.)
df.loc[:, "record_id"] = df.record_id.str.lower().replace(
    r"[^\d\w]", "", regex=True
)

# Coerce predictions and ground truth to real booleans. The adapter is built
# once instead of once per row; missing values stay None.
_to_bool = pydantic.TypeAdapter(bool).validate_python
df.loc[:, 'prediction'] = df.prediction.apply(lambda x: _to_bool(x) if pd.notna(x) else None)
df.loc[:, 'expected'] = df.expected.apply(lambda x: _to_bool(x) if pd.notna(x) else None)

df = df.set_index(["record_id", "source", "step", "model"])

# The *_info columns hold JSON text; decode them into Python objects.
df.loc[:, "step_info"] = df.step_info.apply(json.loads)

if 'agent_info' in df.columns:
    df.loc[:, "agent_info"] = df.agent_info.apply(json.loads)
if 'run_info' in df.columns:
    df.loc[:, "run_info"] = df.run_info.apply(json.loads)

# Show a few random rows; never ask for more rows than exist.
df.sample(min(5, len(df)))


# Show results from hatespeech prompts

In [None]:
from buttermilk.tools.metrics import Metriciser
# Evaluate `prediction` against `expected`, passing step/model/source as the
# `levels` for the evaluation. Later cells read an `accuracy` column and the
# `model` / `source` index levels from the result.
m = Metriciser()
acc = m.evaluate_results(df, levels=["step","model","source"], groundtruth='expected', prediction='prediction')
# Display the metrics table as the cell output.
acc


In [None]:
import datetime
from buttermilk.utils.gsheet import GSheet
# Date stamp (YYYYMMDD) used as the prefix of the sheet title.
today = datetime.date.today().strftime("%Y%m%d")
# Hand the metrics table to the project's GSheet helper. Presumably this
# writes to a Google Sheet; the side effects live in GSheet and are not
# visible here — confirm before re-running.
g = GSheet()
g.save_gsheet(acc, title=f'{today}_results', sheet_name='our prompt')


In [None]:
# Markdown table of per-model accuracy for the "drag queens - alt text" source.
# Rows are ordered by the remaining index and, within equal index values, by
# descending accuracy. The index sort must be stable for the preceding
# accuracy sort to survive; the default sort kind gives no such guarantee.
print(
    acc.reset_index(level='model')
    .xs("drag queens - alt text", level='source')[['model', 'accuracy']]
    .sort_values(by='accuracy', ascending=False)
    .sort_index(kind="mergesort")
    .to_markdown(floatfmt="0.2f", tablefmt="rounded_outline")
)


In [None]:
# Accuracy for the "drag queens - alt text" source, pivoted so that each model
# becomes a column; rows keep the remaining index of `acc`.
tbl = (
    acc.reset_index(level='model')
    .xs("drag queens - alt text", level='source')[['model', 'accuracy']]
    .pivot(columns='model', values='accuracy')
)

# Render the pivoted table as markdown, rows in index order.
print(
    tbl.sort_index().to_markdown(
        floatfmt="0.2f",
        tablefmt="rounded_outline",
    )
)


In [None]:
# Results for the "drag queens - alt text" source only. Take an explicit copy:
# a column is added below, and assigning onto the result of `xs` can raise
# SettingWithCopyWarning.
ours = df.xs('drag queens - alt text', level='source').copy()

# Calendar date of each result, taken from `timestamp`.
ours['date'] = ours['timestamp'].dt.date

# Count of non-null `job_id` values per day, drawn on an explicit axes so the
# figure can be titled and labelled.
fig, ax = plt.subplots()
ours.reset_index().groupby('date').job_id.agg('count').plot(ax=ax)
ax.set_title('Results per day ("drag queens - alt text")')
ax.set_xlabel("date")
ax.set_ylabel("rows with a job_id")
plt.show()



In [None]:
# Count distinct timestamps for every record/step/agent, split by whether the
# decision was correct. groupby does not modify `df`, so no copy is needed.
heat = df.groupby(
    by=["record_id", "step", "agent", "correct"]
).agg(num=("timestamp", "nunique"))
heat = heat.unstack(level=["correct"]).fillna(0)

# Make sure both outcome columns exist. If every decision was correct (or
# every one wrong), the unstack yields only one of them and indexing the
# missing one would raise KeyError.
outcomes = heat["num"].reindex(columns=[True, False], fill_value=0)
heat = (outcomes[True] / (outcomes[True] + outcomes[False])).to_frame("accuracy")

# One column per record; drop the now-constant "accuracy" column level.
heat = heat.unstack("record_id")
heat.columns = heat.columns.droplevel()


# make a heatmap, proportional
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is the
# figure, and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    heat,
    cmap="viridis",
    linewidths=1,
    linecolor="white",
    fmt="0.0%",
    cbar=False,
    annot=True,
    annot_kws={"fontsize": 6},
    ax=ax,
)
_ = ax.set_title("Proportion of correct decisions")

plt.xticks(rotation=45, ha="right", rotation_mode="anchor", fontsize=8)
plt.yticks(fontsize=6)
plt.show()


# Accuracy by standard and model

In [None]:
# Pull recent 'drag queens' / 'osb' rows from the toxicity indicator table,
# together with a `correct` flag computed in SQL by comparing the record's
# expected value with the stored result.
sql = """
WITH SCORES AS
(SELECT JSON_VALUE(tox.record, '$.id') AS example, JSON_VALUE(tox.record, '$.img') AS img, JSON_VALUE(tox.record, '$.caption') AS alt_text, reasons, scores, labels,
tox.id, tox.model, tox.timestamp, JSON_VALUE(tox.record, '$.expected') AS expected, tox.result,
tox.job, tox.source, (JSON_VALUE(tox.record, '$.expected')="true")=tox.result as correct, standard, process
FROM `dmrc-analysis.toxicity.indicator` tox
WHERE TIMESTAMP_TRUNC(timestamp, MONTH) >= TIMESTAMP(DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH))
AND (LOWER(tox.source) = 'drag queens' or LOWER(tox.source) = 'osb')
AND timestamp >= '2024-04-05 00:00:00'
ORDER BY timestamp DESC)

SELECT * FROM SCORES"""

# `client` is never defined in this notebook, so the original
# `client.query(sql).to_dataframe()` fails on a fresh kernel. Use the same
# query helper as the results-loading cell, which returns a DataFrame.
# NOTE(review): this replaces the `df` built by earlier cells.
df = bm.run_query(sql)

# `expected` arrives as JSON text; coerce it to a real boolean (None if missing).
_to_bool = pydantic.TypeAdapter(bool).validate_python
df.loc[:, "expected"] = df["expected"].apply(lambda x: _to_bool(x) if pd.notna(x) else None)

# Rows labelled with the generic name 'standard' are relabelled HATESPEECH.FB.
df.loc[df['standard']=='standard','standard'] = "HATESPEECH.FB"

# Show a few random rows; never ask for more rows than exist.
df.sample(min(5, len(df)))


In [None]:
# Keep only the two processes of interest. Copy so that the assignment below
# edits an independent frame rather than a slice of `df`
# (avoids SettingWithCopyWarning).
heat = df[df["process"].isin(["rules.apply", "toxic"])].copy()

# Shorten the standard names for display. `regex=False` treats the "." as a
# literal dot whatever the installed pandas version's default is.
heat.loc[:, 'standard'] = (
    heat['standard'].str.replace('HATESPEECH.', 'hatespeech ', regex=False).str.lower()
)

# Distinct timestamps per source/model/standard, split by correctness.
heat = heat.groupby(
    by=["source", "model", "standard", "correct"]
).agg(num=("timestamp", "nunique"))
heat = heat.unstack(level=["correct"]).fillna(0)

# Make sure both outcome columns exist before dividing; otherwise a run with
# only correct (or only incorrect) decisions raises KeyError.
outcomes = heat["num"].reindex(columns=[True, False], fill_value=0)
heat = (outcomes[True] / (outcomes[True] + outcomes[False])).to_frame("accuracy")

# One column per standard; drop the now-constant "accuracy" column level.
heat = heat.unstack("standard")
heat.columns = heat.columns.droplevel()

# Show a few random rows; never ask for more rows than exist.
heat.sample(min(10, len(heat)))


In [None]:

# Two side-by-side accuracy heatmaps, one per source, sharing both axes.
fig, axes = plt.subplots(1, 2, figsize=(12, 4), dpi=144, sharex=True, sharey=True)

# Appearance shared by both panels.
heatmap_style = dict(
    cmap="viridis",
    linewidths=1,
    linecolor="white",
    fmt="0.0%",
    cbar=False,
    annot=True,
    annot_kws={"fontsize": 6},
)

for ax, source in zip(axes, ['Drag Queens', 'osb']):
    # Rows of `heat` for this source (first index level).
    df_plot = heat.xs(source)

    sns.heatmap(df_plot, ax=ax, **heatmap_style)
    ax.set_title(f"Proportion of correct {source} decisions")

    # Slightly rotated tick labels; no axis captions.
    ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=15, ha='right', fontsize=10)
    ax.set_xlabel(None)
    ax.set_ylabel(None)

fig.subplots_adjust(bottom=-0.5)
plt.show()


Keep (models): gpt4chaotic, gemini15pro, claude3opus, claude3sonnet

Keep (standards): hatespeech.gelber, hatespeech.gelber.simplified, hatespeech.fb

# show accuracy per example

In [None]:
# Distinct timestamps per example and model/standard/process/combination,
# split by correctness.
# NOTE(review): the SQL in the query cell above does not select a
# `combination` column, so this groupby presumably relies on a column added
# elsewhere — confirm, otherwise it raises KeyError.
heat = df.groupby(by=["example", "model", "standard", "process", "combination", "correct"]).agg(
    num=("timestamp", "nunique")
)
heat = heat.unstack(level=["correct"]).fillna(0)

# Make sure both outcome columns exist before dividing; otherwise data with
# only correct (or only incorrect) decisions raises KeyError.
outcomes = heat["num"].reindex(columns=[True, False], fill_value=0)
heat = (outcomes[True] / (outcomes[True] + outcomes[False])).to_frame("accuracy")

# Drop the model/standard/process index levels (positions 1-3), leaving
# example x combination, then spread the combinations across the columns.
heat = heat.reset_index(level=[1, 2, 3], drop=True)
heat = heat.unstack('combination')
heat.columns = heat.columns.droplevel()

# Show a few random rows; never ask for more rows than exist.
heat.sample(min(10, len(heat)))


In [None]:
# make a heatmap, proportional
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is the
# figure, and draw on the axes explicitly instead of relying on the current axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    heat,
    cmap="viridis",
    linewidths=1,
    linecolor="white",
    fmt="0.0%",
    annot=False,  # cells are not annotated, so fmt / annot_kws have no visible effect
    annot_kws={"fontsize": 4},
    ax=ax,
)
_ = ax.set_title("Proportion of correct decisions")

plt.xticks(rotation=45, ha="right", rotation_mode="anchor", fontsize=6)
plt.yticks(fontsize=6)
plt.show()


# Plot performance across multiple examples

In [None]:
# Distinct timestamps per model/standard for each (expected, result) pair.
heat = df.groupby(by=["model", "standard", "expected", "result"]).agg(
    num=("timestamp", "nunique")
)
heat = heat.unstack(level=["result", "expected"]).fillna(0)

# Name each confusion-matrix cell by its (result, expected) key instead of by
# column position. Renaming positionally only works when all four pairs are
# present; a missing pair used to raise a length-mismatch ValueError.
counts = heat["num"]
confusion_cells = {
    "TN": (False, False),
    "FN": (False, True),
    "FP": (True, False),
    "TP": (True, True),
}
heat = pd.DataFrame(
    {label: counts.get(key, 0.0) for label, key in confusion_cells.items()},
    index=counts.index,
)

# calculate overall accuracy
heat['accuracy'] = (heat['TP'] + heat['TN']) / heat[list(confusion_cells)].sum(axis='columns')

# calculate precision, recall, f1 (NaN where a denominator is zero)
heat["precision"] = heat["TP"] / (heat["TP"] + heat["FP"])
heat["recall"] = heat["TP"] / (heat["TP"] + heat["FN"])
heat["f1"] = (
    2 * (heat["precision"] * heat["recall"]) / (heat["precision"] + heat["recall"])
)

# distribution of performance: one histogram per metric
metric_colours = {"accuracy": "pink", "f1": "purple", "precision": "r", "recall": "g"}
fig, axes = plt.subplots(1, 4, figsize=(16, 3))
for ax, (metric, colour) in zip(axes, metric_colours.items()):
    sns.histplot(heat[metric], bins=20, ax=ax, color=colour)


In [None]:
# The three metrics to chart, with models spread across the columns.
metric_names = ['f1', 'precision', 'recall']
df_plot = heat.unstack("model")[metric_names]

# One stacked panel per metric.
fig, axes = plt.subplots(3, 1, figsize=(12, 10))

for ax, col in zip(axes, metric_names):
    sns.heatmap(
        df_plot[col],
        ax=ax,
        cmap="viridis",
        cbar=None,
        linewidths=1,
        linecolor="white",
        annot=True,
        fmt="0.0%",
        annot_kws={"fontsize": 12},
    )
    ax.set_title(f"{col} by model and prompt standard")

    # Slightly rotated tick labels; no axis captions.
    ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=15, ha='right', fontsize=10)
    ax.set_xlabel(None)
    ax.set_ylabel(None)

fig.subplots_adjust(bottom=-0.5)
plt.show()


## Check heatmap per llm

In [None]:


# Distinct timestamps per example/model, one column per outcome.
df_plot = df.reset_index().pivot_table(
    index=["example", "model"],
    columns="correct",
    values="timestamp",
    aggfunc="nunique",
)

# pivot_table leaves NaN where an outcome never occurred. Turn those into zero
# counts (and guarantee both columns exist) BEFORE dividing. Previously an
# all-correct group computed True / (True + NaN) = NaN, which the trailing
# fillna(0) then reported as 0% accuracy instead of 100%.
df_plot = df_plot.reindex(columns=[True, False]).fillna(0)
df_plot["proportion"] = df_plot[True] / (df_plot[True] + df_plot[False])

# One column per model.
df_plot = df_plot[["proportion"]].unstack(level=[1])


# make a heatmap, proportional
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is the
# figure, and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    df_plot,
    cmap="viridis",
    linewidths=1,
    linecolor="white",
    fmt=".0%",
    annot=True,
    annot_kws={"fontsize": 4},
    ax=ax,
)
_ = ax.set_title("Proportion of correct decisions")

plt.xticks(rotation=45, ha="right", rotation_mode="anchor", fontsize=6)

plt.show()


# show select aggregated stats

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import numpy as np

# One small confusion matrix per row of `heat`. This expects `heat` to be the
# frame with TN/FN/FP/TP columns built in the "Plot performance across
# multiple examples" section.

# Size the grid from the data. The original fixed 4x4 grid raised IndexError
# as soon as `heat` had more than 16 rows.
n_cols = 4
n_rows = max(1, -(-len(heat) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(16, 3 * n_rows), sharex=True, sharey=True, squeeze=False
)
axes = axes.flatten()

# Cell names laid out exactly like the matrix built below: [[TN, FP], [FN, TP]].
# The original took the names from `heat.columns` (TN, FN, FP, TP), which put
# the "FN" label on the FP count and vice versa.
cell_names = np.array([["TN", "FP"], ["FN", "TP"]])

for ax, (idx, row) in zip(axes, heat.iterrows()):
    cm = np.array([row[["TN", "FP"]].values, row[["FN", "TP"]].values])

    ax.set_title(f"{idx}")

    sns.heatmap(cm, annot=False, fmt="0.0f", cmap="Blues", ax=ax, cbar=False)

    # Write "<name>\n<count>" in the centre of each of the four cells.
    for r in range(2):
        for c in range(2):
            ax.text(
                x=c + 0.5,
                y=r + 0.5,
                s=f"{cell_names[r, c]}\n{cm[r, c]:0.0f}",
                ha="center",
                va="center",
                color="black",
            )

# Hide any panels left over in the last grid row.
for ax in axes[len(heat):]:
    ax.set_visible(False)

# Adjust the space between subplots
plt.subplots_adjust(wspace=0.2, hspace=0.5)

plt.show()
