### Pairwise Evaluation
* 두 개 이상의 LLM 생성물 서로 비교한다.

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def evaluate_pairwise(runs: list, example) -> dict:

    # 점수 저장
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i

    # 각 예제에 대한 실행 쌍
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    llm = ChatOpenAI(model="gpt_4o_mini", temperature=0)

    grade_prompt = PromptTemplate.from_template(
    """
    You are an LLM judge. Compare the following two answers to a question and determine which one is better.
    Better answer is the one that is more detailed and informative.
    If the answer is not related to the question, it is not a good answer.

        
    # Question:
    {question}
        
    #Answer A: 
    {answer_a}
        
    #Answer B: 
    {answer_b}
        
    Output should be either `A` or `B`. Pick the answer that is better.
        
    #Preference:
    """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    score = answer_grader.invoke(
        {
            "question" : question,
            "answer_a" : answer_a,
            "answer_b" : answer_b
        }
    )

    if score == "A":
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B":
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0

    return {"key": "ranked_preference","scores": scores}


In [3]:
from rag import PDFRAG
from langchain_openai import ChatOpenAI

def ask_question_with_llm(llm):

    rag=PDFRAG(
        "data/snow-white.pdf",
        llm
    )

    retriever = rag.create_retriever()

    rag_chain = rag.create_chain(retriever)

    def _ask_question(inputs:dict):
        context = retriever.invoke(inputs["question"])
        context = "\n".join([doc.page_content for doc in context])
        return {
            "question" : inputs["question"],
            "context" : context,
            "answer" : rag_chain.invoke(inputs["question"])
        }
    return _ask_question

In [4]:
from langchain_openai import ChatOpenAI

gpt3 = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

gpt3.invoke("안녕하세요?")

AIMessage(content='안녕하세요! 무엇을 도와드릴까요?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-bf486623-aa4e-4ba3-bf3d-31e3f26566db-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

In [5]:
gpt4o_chain = ask_question_with_llm(ChatOpenAI(model="gpt-4o-mini", temperature=0))
gpt3_chain = ask_question_with_llm(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))

# ollama 사용시
# ollama_chain = ask_question_with_llm(ChatOllam(model=""))

In [6]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": ChatOpenAI(model="gpt-4o-mini", temperature=0)}, # 평가자
    prepare_data=lambda run, example: {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["context"],
        "input": example.inputs["question"]
    }
)

dataset_name = "RAG_EVALUTION_DATASET"

experiment_result1 = evaluate(
    gpt3_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-3.5-turbo 평가 (cot_qa)"
    }
)

experiment_result2 = evaluate(
    gpt4o_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-4o-mini 평가 (cot_qa)"
    }
)

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-b5190e80' at:
https://smith.langchain.com/o/0dd3457e-dff5-49bf-8543-15fbe60fbc08/datasets/0a7e99f7-e890-41a7-86f8-020f54eec14d/compare?selectedSessions=4d239271-957a-4ece-82cb-1635eb48b3bf




0it [00:00, ?it/s]

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-6f9a509f' at:
https://smith.langchain.com/o/0dd3457e-dff5-49bf-8543-15fbe60fbc08/datasets/0a7e99f7-e890-41a7-86f8-020f54eec14d/compare?selectedSessions=dd75b169-73e7-4fc0-9359-f0dc8a3256b8




0it [00:00, ?it/s]

In [None]:
from langsmith.evaluation import evaluate_comparative

evaluate_comparative(
    ["MODEL_COMPARE_EVALUATION-6f9a509f", "MODEL_COMPARE_EVALUATION-b5190e80"],

    # 평가자
    evaluators=[evaluate_pairwise]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/0dd3457e-dff5-49bf-8543-15fbe60fbc08/datasets/0a7e99f7-e890-41a7-86f8-020f54eec14d/compare?selectedSessions=77a356f7-b864-407d-b252-a59a79bda14c%2Cd944a1e4-8823-4ad9-a9cc-815269452864&comparativeExperiment=84f43c3e-1425-4ce2-8cb5-2d9bc19a617a




  0%|          | 0/5 [00:00<?, ?it/s]

NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt_4o_mini` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

## ollama

In [None]:
!pip install langchain-ollama

Collecting langchain-ollama
  Downloading langchain_ollama-0.2.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ollama<1,>=0.3.0 (from langchain-ollama)
  Downloading ollama-0.3.3-py3-none-any.whl.metadata (3.8 kB)
Downloading langchain_ollama-0.2.0-py3-none-any.whl (14 kB)
Downloading ollama-0.3.3-py3-none-any.whl (10 kB)
Installing collected packages: ollama, langchain-ollama
Successfully installed langchain-ollama-0.2.0 ollama-0.3.3


In [None]:
from langchain_ollama import ChatOllama

# Ollama 모델을 불러옵니다.
ollama = ChatOllama(model="")

# Ollama 모델 호출
ollama.invoke("안녕하세요?")

RequestError: must provide a model