# Custom String Evaluator

In [1]:
from dotenv import load_dotenv

# Populate the process environment from a local .env file (if one is found)
# before any later cell runs.
load_dotenv()

True

In [2]:
from typing import Any, Optional

from evaluate import load
from langchain.evaluation import StringEvaluator


class PerplexityEvaluator(StringEvaluator):
    """Score a predicted string by its perplexity.

    The score is produced by the ``perplexity`` metric obtained through
    ``load``; ``model_id`` selects the model the metric is computed with.
    """

    def __init__(self, model_id: str = "gpt2"):
        """Store the model id and load the perplexity metric for it.

        Args:
            model_id: Model identifier forwarded to ``load`` here and to
                ``compute`` on every evaluation. Defaults to ``"gpt2"``.
        """
        self.model_id = model_id
        # Same keyword arguments as a direct call, gathered in one place.
        load_options = {
            "module_type": "metric",
            "model_id": self.model_id,
            "pad_token": 0,
        }
        self.metric_fn = load("perplexity", **load_options)

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Compute the perplexity of ``prediction``.

        Args:
            prediction: The text to score.
            reference: Unused by this evaluator.
            input: Unused by this evaluator.
            **kwargs: Additional keyword arguments; ignored.

        Returns:
            A dict of the form ``{"score": <perplexity>}``.
        """
        # The metric is handed a one-element list, so the first entry of
        # "perplexities" is the value for this prediction.
        metric_output = self.metric_fn.compute(
            model_id=self.model_id, predictions=[prediction]
        )
        return {"score": metric_output["perplexities"][0]}

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build the evaluator with its default model_id ("gpt2").
evaluator = PerplexityEvaluator()

In [5]:
# Baseline: score a familiar English sentence; the result is a dict with a "score" key.
evaluator.evaluate_strings(prediction="The rains in Spain fall mainly on the plain.")

100%|██████████| 1/1 [00:01<00:00,  1.62s/it]


{'score': 190.36737060546875}

In [6]:
# The perplexity is much higher here because LangChain was introduced after gpt-2 was released,
# and the word is never used in a context like the following one.
evaluator.evaluate_strings(prediction="The rains in Spain fall mainly on LangChain.")

100%|██████████| 1/1 [00:00<00:00,  2.10it/s]


{'score': 1982.0709228515625}