In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Scoring backend selector: "st" uses sentence-transformers cosine similarity;
# every other value falls through to BERTScore.
EVAL_METHOD = "st"

# Import only the backend that was selected above.
if EVAL_METHOD != "st":
    from bert_score import score
else:
    from sentence_transformers import SentenceTransformer, util

    # Embedding model shared by every similarity computation in this notebook.
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Identifies which caption file is read here and is reused for the output name.
project_slug = "japanese_loc_gemma-3-12b-it"

# Tab-separated table of captions for the selected project.
captions_path = f"generated_captions/{project_slug}_captions.txt"
df = pd.read_csv(captions_path, sep="\t")

In [None]:
def compare_texts(gt, pred):
    """Score how similar a predicted caption is to its ground-truth caption.

    The backend is selected by the module-level ``EVAL_METHOD``:

    * ``"st"``: cosine similarity between the embeddings that the
      module-level ``model`` produces for the two texts.
    * any other value: BERTScore F1, computed with ``lang="en"`` and
      baseline rescaling.

    NOTE(review): BERTScore is configured for English (``lang="en"``);
    confirm that the captions being compared are English text.

    Args:
        gt: Ground-truth (reference) text, or a missing value.
        pred: Predicted (candidate) text, or a missing value.

    Returns:
        The similarity score as a float, or ``np.nan`` when either input is
        missing according to ``pd.isna``.
    """
    # A pair with a missing side cannot be scored.
    if pd.isna(gt) or pd.isna(pred):
        return np.nan

    if EVAL_METHOD == "st":
        # Embeddings are moved to the CPU before the similarity is computed.
        gt_embedding = model.encode(gt, convert_to_tensor=True).cpu()
        pred_embedding = model.encode(pred, convert_to_tensor=True).cpu()
        return util.pytorch_cos_sim(gt_embedding, pred_embedding).item()

    # bert_score.score expects candidates first and references second; the
    # prediction is the candidate. F1 is symmetric in the two, so the returned
    # value is unchanged, but precision and recall now carry their documented
    # meaning should they ever be used.
    _precision, _recall, f1 = score(
        cands=[pred], refs=[gt], lang="en", rescale_with_baseline=True
    )
    return f1.tolist()[0]


# Output score column -> column holding the generated caption that is compared
# against the "human" caption of the same row.
score_sources = {
    "image_only_scores": "model_from_image",
    "metadata_image_scores": "model_img+metadata",
    "description_image_scores": "collaborative",
}

for score_column, caption_column in score_sources.items():
    # The default argument pins the current caption column for this lambda.
    df[score_column] = df.apply(
        lambda row, source=caption_column: compare_texts(row["human"], row[source]),
        axis=1,
    )

In [None]:
# Persist the captions together with their similarity scores as a TSV file.
results_df = pd.DataFrame(df)

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise the write fails only after all scores are computed.
evaluations_dir = Path("evaluations")
evaluations_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): the file name says "bertscore" whatever EVAL_METHOD is set to;
# it is kept unchanged here so existing consumers of this path keep working.
results_df.to_csv(evaluations_dir / f"{project_slug}_bertscore.tsv", sep="\t")

In [None]:
# Report the NaN-ignoring mean and standard deviation of each score column.
print("FOR EVALUATION METHOD", EVAL_METHOD)

summary_rows = (
    ("MEAN IMAGE-ONLY SIMILARITY:", "image_only_scores"),
    ("MEAN IMAGE+METADATA SIMILARITY:", "metadata_image_scores"),
    ("MEAN IMAGE+DESCRIPTION SIMILARITY:", "description_image_scores"),
)

for heading, score_column in summary_rows:
    column_scores = results_df[score_column]
    print(heading, np.nanmean(column_scores), "STDEV:", np.nanstd(column_scores))