### Build an LLM judge to choose which model's output is preferred

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from hellofresh_extractor.llm.StructuredGeminiCaller import StructuredGeminiCaller
from hellofresh_extractor.gsuite.drive.GoogleDriveHelper import GoogleDriveHelper
from hellofresh_extractor.llm.StructuredClaudeCaller import StructuredClaudeCaller
from hellofresh_extractor.llm.prompts import multimodal_system_prompt, multimodal_user_query, judge_compare_prompt
from hellofresh_extractor.llm.output_schemas import JudgeModel
import glob
from PIL import Image
from pillow_heif import register_heif_opener
from dotenv import load_dotenv
import time
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Use seaborn's "notebook" plotting context for the figures drawn in this notebook.
sns.set_context("notebook")

In [None]:
# Load variables from a .env file into the environment; the judge callers
# below read GEMINI_API_KEY and ANTHROPIC_API_KEY via os.environ.
load_dotenv()

In [None]:
# Resolve the "images" folder relative to the current working directory
# (i.e. wherever the notebook kernel was started).
this_path = os.getcwd()
images_path = os.path.join(this_path,"images")

In [None]:
# List the HEIC photos directly inside the images folder. The pattern is the
# upper-case "*.HEIC" only, so lower-case extensions are not matched on
# case-sensitive filesystems.
images = glob.glob(os.path.join(images_path,"*.HEIC"))

In [None]:
# Table of candidate outputs to compare. The judging loop reads the
# "image_id", "recipe_local" and "recipe_gemini" columns of each row.
dataset_compare = pd.read_csv("test_hello_fresh_recipes_joined.csv")

In [None]:
# Candidate judge backed by Gemini; the API key comes from the environment
# (None if GEMINI_API_KEY is not set).
gemini_caller = StructuredGeminiCaller(
    api_key=os.environ.get("GEMINI_API_KEY"),
    model="gemini-2.5-flash-preview-04-17"
)

# Candidate judge backed by Claude; the API key comes from the environment
# (None if ANTHROPIC_API_KEY is not set).
claude_caller = StructuredClaudeCaller(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
    model="claude-3-7-sonnet-latest"
)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
dataset_compare.head()

In [None]:
# Pairwise LLM-as-judge evaluation.
#
# For every row of `dataset_compare`, the two candidate recipes are shown to
# the judge under the anonymous labels "A" and "B" (assigned at random per
# row), and the judge's verdict is mapped back to the originating column name.
#
# Results left in the notebook namespace:
#   judge_result      -- dict of lists with keys "winner" and "reasoning"
#   mean_process_time -- average wall-clock seconds per dataset row
#
# NOTE: rows for which the judge returns no "structured_data" are skipped, so
# `judge_result` can be shorter than `dataset_compare`.
all_meals = []  # not populated by this loop; kept so the name stays defined
register_heif_opener()  # lets PIL open HEIC/HEIF files
t0 = time.time()
judge_result = defaultdict(list)
recipe_choices = ["recipe_local","recipe_gemini"]
for i, row in dataset_compare.iterrows():
    print("*"*30)
    print(f"At image {i}")
    image = row["image_id"]

    # Randomise which candidate is presented as "A" so the judge cannot
    # favour a model through its position; "B" is simply the other one.
    recipe_choice_A = int(np.random.choice(len(recipe_choices)))
    recipe_choice_B = 1 - recipe_choice_A
    recipe_A = row[recipe_choices[recipe_choice_A]]
    recipe_B = row[recipe_choices[recipe_choice_B]]

    # Maps the judge's anonymous label back to the dataset column it hid.
    choice_dict = {
        "A": recipe_choices[recipe_choice_A],
        "B": recipe_choices[recipe_choice_B]
    }

    print(choice_dict)

    # Close the source file as soon as the RGB copy has been made.
    with Image.open(image) as raw_image:
        open_image = raw_image.convert("RGB")

    model_A = f"""
    Model A's result was
    {recipe_A}
    """

    # The second candidate must be labelled "B"; labelling both as "A" leaves
    # the judge unable to tell which answer a verdict of "B" refers to.
    model_B = f"""
    Model B's result was
    {recipe_B}
    """

    # To judge with Gemini instead, swap in the call below:
    # result = gemini_caller.invoke(
    #     system_message=judge_compare_prompt,
    #     input_content=[open_image, model_A, model_B, "Please judge the results according to the instructions"],
    #     output_schema=JudgeModel
    # )

    result = claude_caller.invoke(
        system_message=judge_compare_prompt,
        input_content=[open_image, model_A, model_B, "Please judge the results according to the instructions. Remember to always return valid JSON like this example {'winner':'A','reasoning':'Model A is the best'}"],
        output_schema=JudgeModel
    )

    if "structured_data" in result:
        analysis = result["structured_data"]
        # Any verdict other than "A"/"B" is recorded as a tie.
        judge_result['winner'].append(choice_dict.get(analysis["winner"],"tie"))
        judge_result['reasoning'].append(analysis["reasoning"])
        print(analysis)
    else:
        print(f"Structured data field not found for result {result}")
t1 = time.time()
# Average over the rows actually iterated (not the unrelated HEIC glob count),
# and avoid dividing by zero when the dataset is empty.
n_rows = len(dataset_compare)
mean_process_time = (t1 - t0) / n_rows if n_rows else 0.0

In [None]:
# Turn the collected verdict lists ("winner" and "reasoning") into a DataFrame.
judge_result_pd = pd.DataFrame(judge_result)

In [None]:
# Persist the verdicts under the name of the judge that produced them. The
# judging loop calls the Claude judge and the reload cell reads the "_Claude"
# file, so writing to "_Gemini" would overwrite the Gemini-judged results.
# Change the suffix when switching the judge model.
judge_result_pd.to_csv("test_hello_fresh_recipes_judge_result_Claude.csv")

In [None]:
# Reload saved verdicts from the Claude-judged results file; the plot below
# reads its "winner" column.
judge_result_pd = pd.read_csv("test_hello_fresh_recipes_judge_result_Claude.csv")

In [None]:
# Bar chart showing how many times the judge picked each candidate.
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=judge_result_pd, x="winner", palette="pastel", edgecolor=".6")

# Write each bar's count just above its top edge, centred horizontally.
label_style = {"ha": "center", "va": "bottom", "fontsize": 12, "fontweight": "bold"}
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(str(int(bar_height)), (bar_centre, bar_height), **label_style)

# Titles and axis labels, then drop the top/right spines and render.
ax.set_title("Judge Results: Claude as Judge", fontsize=16, fontweight='bold')
ax.set_xlabel("Winner", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
sns.despine()
plt.tight_layout()
plt.show()