In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import time
from collections import defaultdict
from evaluation_docstrings.llm.utils import load_secrets
from evaluation_docstrings.llm.config import costs
from evaluation_docstrings.plotting.utils import (
    generate_basic_analysis_cost_latency,
    generate_scores_barplot_dataset,
    generate_cohen_kappa_plot,
)
from evaluation_docstrings.prompting.evaluation_prompts import (
    DocStringEvaluationPrompt,
    DocStringEvaluation,
)
from evaluation_docstrings.llm.structured_output_callers import (
    AnthropicStructuredOutputCaller,
)
from evaluation_docstrings.task.utils import (
    format_result_for_evaluation,
    assemble_raw_match_scores,
)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Blanket filter: suppresses every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries imported above;
# consider narrowing it to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

In [None]:
# Credentials come from the project helper rather than being hard-coded; the result is
# indexed by key name below (where load_secrets reads from is not visible in this notebook).
secrets = load_secrets()
# Judge model name; reused by the plotting cells for figure titles and output file names.
MODEL = "claude-3-5-sonnet-latest"
# Structured-output client for Anthropic; evaluation_loop uses it for the Claude model names.
claude_caller = AnthropicStructuredOutputCaller(api_key=secrets["ANTHROPIC_API_KEY"])

In [None]:
# `alignment_ground_truth` is not created by any cell in this notebook, so on a fresh
# kernel (Restart & Run All) an unguarded export raises NameError and stops the run.
# Export only when the frame exists in this session; the next cell reads the CSV from disk.
if "alignment_ground_truth" in globals():
    alignment_ground_truth.to_csv(
        "evaluation_docstrings/datasets/alignment_ground_truth.csv", index=False
    )
else:
    print("alignment_ground_truth is not defined in this session; skipping CSV export.")

In [None]:
# Load the stored alignment ground-truth table from disk.
ground_truth_dataset = pd.read_csv(
    "evaluation_docstrings/datasets/alignment_ground_truth.csv"
)
# Keep only the join key and the three ground-truth score columns, so this frame can be
# merged onto evaluation results by `code_id`.
ground_truth_only = ground_truth_dataset.loc[
    :,
    ["code_id", "gt_coverage_score", "gt_clarity_score", "gt_accuracy_score"],
]

In [None]:
def evaluation_loop(df, model, ground_truth=None):
    """Score every row of ``df`` with an LLM judge and collect the results.

    For each row, the message built by ``format_result_for_evaluation`` is sent to
    the structured-output caller that serves ``model``. The critique and the three
    scores from the response are stored together with the wall-clock latency, the
    token counts and the cost computed from ``costs[model]``.

    A row whose evaluation fails (the call raises, a key is missing from the
    response, or a score cannot be converted to ``int``) is reported and left out
    of the result. It never inherits values from a previously evaluated row.

    Args:
        df: DataFrame with at least the columns ``code_id``, ``code_type``,
            ``code_name``, ``input_code`` and ``expert_docstrings``, plus whatever
            ``format_result_for_evaluation`` reads from a row.
        model: Name of the judge model. Must be a key of ``costs`` and one of the
            Claude, OpenAI or Gemini model names handled below.
        ground_truth: Optional DataFrame with a ``code_id`` column. When given, it
            is merged onto the results on ``code_id`` (pandas' default inner join).

    Returns:
        DataFrame with one row per successfully evaluated input and the columns
        ``code_id``, ``code_type``, ``code_name``, ``input_code``,
        ``expert_docstrings``, ``eval_model_critique``, ``accuracy_score``,
        ``coverage_score``, ``clarity_score``, ``process_time``, ``input_tokens``,
        ``output_tokens`` and ``total_cost``, followed by any ground-truth columns.
        The columns are present even when no row could be evaluated.

    Raises:
        AssertionError: if ``model`` has no entry in ``costs``.
        ValueError: if ``model`` is not one of the recognised model names.
    """
    assert model in costs, f"No cost entry configured for model {model!r}"

    # Resolve the provider once, before any request is made, so an unknown model name
    # fails loudly instead of being caught and printed for every row.
    # NOTE(review): open_ai_caller and gemini_caller are expected in the notebook
    # namespace; only claude_caller is created in the visible cells.
    if model in ["claude-3-5-sonnet-latest", "claude-3-5-haiku-latest"]:
        caller, model_kwargs = claude_caller, {"input_model_name": model}
    elif model in ["gpt-4o", "gpt-4o-mini"]:
        caller, model_kwargs = open_ai_caller, {"input_model_name": model}
    elif model in ["gemini-1.5-flash", "gemini-1.5-pro"]:
        # The Gemini caller is invoked without an explicit model name.
        caller, model_kwargs = gemini_caller, {}
    else:
        raise ValueError(f"Model name {model} not recognized")

    result_columns = [
        "code_id",
        "code_type",
        "code_name",
        "input_code",
        "expert_docstrings",
        "eval_model_critique",
        "accuracy_score",
        "coverage_score",
        "clarity_score",
        "process_time",
        "input_tokens",
        "output_tokens",
        "total_cost",
    ]
    records = []
    failed_code_ids = []

    for i, row in df.iterrows():
        code_id = row["code_id"]
        evaluation_message = format_result_for_evaluation(row)
        print(f"Evaluating response for {i} : code id {code_id}")

        t1 = time.time()
        try:
            evaluation_response = caller.invoke(
                evaluation_message,
                DocStringEvaluationPrompt,
                DocStringEvaluation,
                temperature=0,
                **model_kwargs,
            )
            tokens = caller.token_counter(evaluation_response)

            model_output = evaluation_response["model_dump"]
            input_tokens = tokens["input_tokens"]
            output_tokens = tokens["output_tokens"]
            # Prices are divided by 1e6, i.e. presumably quoted per million tokens.
            total_cost = (
                costs[model]["input"] * input_tokens
                + costs[model]["output"] * output_tokens
            ) / 1e6

            # Parse the response inside the try block so a malformed answer counts
            # as a failed row rather than a partially recorded one.
            critique = model_output["critique"]
            accuracy = int(model_output["accuracy"])
            coverage = int(model_output["coverage"])
            clarity = int(model_output["clarity"])
        except Exception as e:
            # Best effort per row: report and move on, but record nothing for it.
            print(f"Evaluation failed for code id {code_id}: {e!r}")
            failed_code_ids.append(code_id)
            continue
        t2 = time.time()

        records.append(
            {
                "code_id": code_id,
                "code_type": row["code_type"],
                "code_name": row["code_name"],
                "input_code": row["input_code"],
                "expert_docstrings": row["expert_docstrings"],
                "eval_model_critique": critique,
                "accuracy_score": accuracy,
                "coverage_score": coverage,
                "clarity_score": clarity,
                "process_time": t2 - t1,
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "total_cost": total_cost,
            }
        )

    if failed_code_ids:
        print(
            f"{len(failed_code_ids)} of {len(df)} evaluations failed: {failed_code_ids}"
        )

    # Passing explicit columns keeps the schema stable even with zero records, so
    # the merge on "code_id" below cannot fail on a column-less frame.
    evaluation_df = pd.DataFrame(records, columns=result_columns)

    # join on ground truth labels if we have them
    if isinstance(ground_truth, pd.DataFrame):
        evaluation_df = evaluation_df.merge(ground_truth, on="code_id")

    return evaluation_df


# get data ready for barplot
def assemble_scores_barplot(scores_df, id_cols):
    """Aggregate per-item scores into per-type fractions for a grouped bar plot.

    Every column of ``scores_df`` that is not listed in ``id_cols`` is treated as a
    score column. The frame is melted to long format and then, for each
    (score column, ``code_type``) pair, the scores are summed and divided by the
    number of items in that group.

    Args:
        scores_df: Wide DataFrame with the identifier columns and one column per
            score type.
        id_cols: Identifier column names. Must include ``code_type`` and
            ``code_name``, which the aggregation below relies on.

    Returns:
        DataFrame with the columns ``score_type``, ``code_type``, ``score`` (sum),
        ``total`` (item count) and ``fraction`` (``score / total``).
    """
    # Derive the score columns from the frame that was passed in, not from any
    # notebook-level variable, so the function works on a fresh kernel and for any input.
    score_columns = [c for c in scores_df.columns if c not in id_cols]

    melted_df = scores_df.melt(
        id_vars=id_cols,
        value_name="score",
        value_vars=score_columns,
        var_name="score_type",
    )
    totals = (
        melted_df.groupby(["score_type", "code_type"])
        .agg({"score": "sum", "code_name": "count"})
        .reset_index()
        .rename(columns={"code_name": "total"})
    )
    totals["fraction"] = totals["score"] / totals["total"]
    return totals

In [None]:
# `claude_alignment_result` is not created by any cell in this notebook, so on a fresh
# kernel (Restart & Run All) an unguarded export raises NameError and stops the run.
# Export only when the frame exists in this session; the next cell reads the CSV from disk.
if "claude_alignment_result" in globals():
    claude_alignment_result.to_csv(
        "evaluation_docstrings/datasets/claude_sonnet35_alignment_result.csv",
        index=False,
    )
else:
    print("claude_alignment_result is not defined in this session; skipping CSV export.")

In [None]:
# Reload the stored Claude evaluation results from disk; the plotting cells below all
# read from `evaluation_result`.
evaluation_result = pd.read_csv(
    "evaluation_docstrings/datasets/claude_sonnet35_alignment_result.csv"
)
# Plot helper imported from evaluation_docstrings.plotting.utils — presumably a Cohen's
# kappa agreement plot between judge and ground-truth scores; confirm against the helper.
generate_cohen_kappa_plot(evaluation_result, MODEL)

In [None]:
# Pair each judge score column with its ground-truth counterpart and let the project
# helper build the per-item table (presumably match indicators — confirm in task.utils).
plot_df = assemble_raw_match_scores(
    evaluation_result,
    id_col="code_name",
    type_col="code_type",
    scores_columns=dict(
        accuracy_score="gt_accuracy_score",
        clarity_score="gt_clarity_score",
        coverage_score="gt_coverage_score",
    ),
)
# Long-format table with one `fraction` per (score_type, code_type), as plotted below.
barplot_df = generate_scores_barplot_dataset(
    plot_df,
    id_cols=["code_name", "code_type"],
)

# Single-axes figure; everything is drawn and saved through the explicit fig/axis handles.
fig, axis = plt.subplots(figsize=(8, 5))
sns.barplot(
    x="score_type",
    y="fraction",
    hue="code_type",
    data=barplot_df,
    ax=axis,
)
axis.set(xlabel="Score type", ylabel="Fraction matching")
# Overall title for the figure.
fig.suptitle(f"Fraction success for evaluation ({MODEL})", fontsize=16)
fig.savefig(f"{MODEL}_success_fraction.png")

In [None]:
# Cost/latency summary for the evaluation run; the helper's return value is kept as `fig`.
fig = generate_basic_analysis_cost_latency(evaluation_result, model_name=MODEL)
# plt.savefig writes matplotlib's *current* figure — assumes that is the figure the helper
# just created; TODO confirm (otherwise save through the returned object instead).
plt.savefig(f"{MODEL}_evaluation_costs.png")