In [None]:
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
from unitxt.inference import (
    IbmGenAiInferenceEngineParams,
    IbmGenAiInferenceEngine,
    HFPipelineBasedInferenceEngine
)
from unitxt.llm_as_judge import LLMAsJudge
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import InputOutputTemplate, TemplatesDict
from unitxt.text_utils import print_dict
import yaml

# Logger obtained from unitxt's get_logger(); not referenced in the cells visible here.
logger = get_logger()
from unitxt import add_to_catalog

# Single "test" split: the same user question repeated three times, each as its
# own dict object, so that three different predictions can be judged against it.
data = {
    "test": [
        {"conversation": "user: is eugene from walking dead really a scientist"}
        for _ in range(3)
    ]
}

# One candidate answer per test instance, in the same order as data["test"].
# Adjacent string literals inside each pair of parentheses are concatenated by
# the parser into a single string.
predictions = [
    (
        "In the Walking Dead series, Eugene Porter initially claims to be a "
        "scientist who knows the cure to the zombie plague. However, it is "
        "eventually revealed that he is not a scientist, but a high school "
        "science teacher who lied about knowing the cure to manipulate other "
        "survivors into helping him."
    ),
    (
        "No, Eugene Porter, played by Josh McDermitt, is not a scientist in "
        "real life. In the Walking Dead TV series, he portrays a character who "
        "initially appears to be a scientist with knowledge of a cure for the "
        "zombie outbreak. However, it is later revealed that he is actually a "
        "high school science teacher who lied about his credentials to "
        "manipulate other survivors into taking him to Washington D.C."
    ),
    "thank you for the question.",
]


In [None]:

# Judges configuration file; the path is relative to the notebook's working directory.
config_filepath = '../prepare/templates/response_assessment/judges/config_judges.yaml'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding. safe_load only builds plain Python objects.
with open(config_filepath, "r", encoding="utf-8") as stream:
    configs = yaml.safe_load(stream)

# Rows of the 'relevance' matrix are indexed positionally below:
# element 0 feeds the template list, element 1 feeds the model list.
matrix = configs['metric_version_models']['relevance']

template_lst = [entry[0] for entry in matrix]
model_lst = [entry[1] for entry in matrix]

# from unitxt.processors import ToString
# add_to_catalog(
#     ToString('../prepare/templates/response_assessment/judges/relevance/ensemble_relevancy_v2.pkl'),
#     "templates.response_assessment.judges.relevance.ensemble",
#     overwrite=True,
# )


In [None]:
from unitxt.llm_as_judge import MultipleLLMAsJudges

# Backend label; in the cells visible here it is only interpolated by the
# commented-out single-judge variant below.
platform = 'ibm_gen_ai'

# Generation parameters shared by every judge engine.
gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)

# One inference engine per judge model listed in the configuration. Built with a
# comprehension so no loop temporaries are left behind in the notebook namespace.
inference_model_lst = [
    IbmGenAiInferenceEngine(model_name=judge_model_name, parameters=gen_params)
    for judge_model_name in model_lst
]

# Single-judge alternative, kept for reference (needs one `inference_model`
# and its `model_name` to be defined first):
# llm_judge_metric = LLMAsJudge(
#     inference_model=inference_model,
#     template="templates.response_assessment.judges.relevance.v3",
#     task="rating.single_turn",
#     main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '')}{platform}",
#     strip_system_prompt_and_format_from_inputs=False,
# )

# Multi-judge metric: receives the engine list and the template list taken
# from the same configuration rows.
llm_judge_metric = MultipleLLMAsJudges(
    inference_model=inference_model_lst,
    template=template_lst,
    task="rating.single_turn",
    strip_system_prompt_and_format_from_inputs=False,
)


# Loader that serves the in-memory `data` dictionary.
dict_loader = LoadFromDictionary(data=data)

# Task with a single string input field ("conversation"), no output fields,
# string predictions, and the LLM-judge metric as its only metric.
conversation_task = Task(
    inputs={"conversation": "str"},
    outputs={},
    prediction_type="str",
    metrics=[llm_judge_metric],
)

# The only template, registered under the key "simple": the input is the
# conversation text itself and the output format is the empty string.
simple_template = InputOutputTemplate(
    input_format="{conversation}",
    output_format="",
)

card = TaskCard(
    loader=dict_loader,
    task=conversation_task,
    templates=TemplatesDict({"simple": simple_template}),
)

# Load the dataset described by the card with its "simple" template and keep
# only the "test" split.
loaded_splits = load_dataset(card=card, template_card_index="simple")
test_dataset = loaded_splits["test"]

# Evaluate the fixed predictions against the test instances.
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)

# Fields printed for every evaluated instance.
fields_to_show = (
    "source",
    "prediction",
    "processed_prediction",
    "references",
    "score",
)

for instance in evaluated_dataset:
    # A fresh list is handed to each call, as in the inline-literal form.
    print_dict(instance, keys_to_print=list(fields_to_show))
