### Configure Evaluation parameters

In [None]:
import os
# Select the AWS profile for this process. `os.system('export ...')` would only
# set the variable inside a short-lived subshell, leaving this process (and any
# AWS client created in it) unaffected, so assign to os.environ directly.
os.environ["AWS_PROFILE"] = "default"

# NOTE(review): the Langfuse API keys below are hard-coded in the notebook.
# Prefer loading them from the environment or a secrets store, and rotate
# these values if the notebook has been shared.
os.environ["LANGFUSE_PUBLIC_KEY"] = 'pk-lf-c8ec60a4-3f7e-4e65-8eda-09e76f796b3f'
os.environ["LANGFUSE_SECRET_KEY"] = 'sk-lf-0ffdfee6-4e88-4110-85ef-b6e153382c81'
os.environ["LANGFUSE_HOST"] = 'http://localhost:3000'

In [None]:
# Langchain Eval types
# Per-criterion switches: set a flag to False to skip that evaluation.
# "hallucination" is evaluated by its own reference-based chain; every other
# enabled key is passed as the criterion name to the generic criteria evaluator.
EVAL_TYPES = dict(
    hallucination=True,
    conciseness=True,
    relevance=True,
    coherence=True,
    harmfulness=True,
    maliciousness=True,
    helpfulness=True,
    controversiality=True,
    misogyny=True,
    criminality=True,
    insensitivity=True,
)

In [None]:
from langfuse import Langfuse
 
# Module-level Langfuse client shared by the fetch and scoring cells below.
# NOTE(review): constructed with no arguments - presumably it picks up the
# LANGFUSE_* environment variables set in the first cell; confirm.
langfuse = Langfuse()

### Fetch data from all the LLM generations (effectively, LLM responses)

In [None]:
def fetch_all_pages(name=None, user_id=None, limit=50):
    """Collect every generation returned by ``langfuse.get_generations``.

    Pages are requested one at a time, starting at page 1, until a page
    comes back with empty ``data``.

    Args:
        name: Passed through as the ``name`` filter (``None`` for no filter).
        user_id: Passed through as the ``user_id`` filter.
        limit: Page size passed to each request.

    Returns:
        A single list containing the ``data`` items of all pages, in order.
    """

    def _pages():
        # Yield each non-empty page's data; stop at the first empty page.
        page_number = 1
        while True:
            batch = langfuse.get_generations(
                name=name, limit=limit, user_id=user_id, page=page_number
            ).data
            if not batch:
                return
            yield batch
            page_number += 1

    return [item for batch in _pages() for item in batch]

In [None]:
# Fetch all generations once (no name/user filter, default page size).
# The evaluation functions below iterate over this module-level list.
generations = fetch_all_pages()

### Using Amazon Titan Text Express (SLM variant) to evaluate the responses

In [None]:
from langchain.evaluation import load_evaluator
from langchain.evaluation.criteria import LabeledCriteriaEvalChain
from langchain_aws import BedrockLLM
 
def get_evaluator_for_key(key: str):
    """Build a LangChain "criteria" evaluator for the criterion ``key``.

    The evaluator is backed by a ``BedrockLLM`` using the
    ``amazon.titan-text-express-v1`` model and the "default" credentials
    profile. A new LLM client is created on every call.

    Args:
        key: Criterion name forwarded as ``criteria`` to ``load_evaluator``.

    Returns:
        The evaluator returned by ``load_evaluator("criteria", ...)``.
    """
    titan_llm = BedrockLLM(
        model_id="amazon.titan-text-express-v1",
        credentials_profile_name="default",
    )
    criteria_evaluator = load_evaluator("criteria", criteria=key, llm=titan_llm)
    return criteria_evaluator
 
def get_hallucination_eval():
    """Build the labeled criteria chain used for hallucination checks.

    The chain is created with ``LabeledCriteriaEvalChain.from_llm`` using a
    single custom "hallucination" criterion and a ``BedrockLLM`` on the
    ``amazon.titan-text-express-v1`` model ("default" credentials profile).

    Returns:
        The configured ``LabeledCriteriaEvalChain``.
    """
    hallucination_question = (
        "Does this submission contain information"
        " not present in the input or reference?"
    )
    titan_llm = BedrockLLM(
        model_id="amazon.titan-text-express-v1",
        credentials_profile_name="default",
    )
    return LabeledCriteriaEvalChain.from_llm(
        criteria={"hallucination": hallucination_question},
        llm=titan_llm,
    )

#### After running this, check the Langfuse console for the scores

In [None]:
def execute_eval_and_score():
    """Evaluate every fetched generation against each enabled criterion.

    For each generation in the module-level ``generations`` list and each
    key enabled in ``EVAL_TYPES`` (except "hallucination"), runs the criteria
    evaluator on the generation's input/output, prints the result, and
    records it in Langfuse as a score named after the criterion.

    Results without a score or reasoning are printed but not recorded.
    """
    # "hallucination" is excluded here; eval_hallucination() covers it with a
    # reference-based chain.
    criteria = [
        key
        for key, enabled in EVAL_TYPES.items()
        if enabled and key != "hallucination"
    ]
    if not generations or not criteria:
        return

    # Build one evaluator (and one Bedrock client) per criterion up front
    # instead of re-creating them for every generation.
    evaluators = {
        criterion: get_evaluator_for_key(criterion) for criterion in criteria
    }

    for generation in generations:
        for criterion, evaluator in evaluators.items():
            eval_result = evaluator.evaluate_strings(
                prediction=generation.output,
                input=generation.input,
            )
            print(eval_result)

            # Same guard as eval_hallucination(): do not send a score when
            # the evaluator produced no score or no reasoning.
            if (
                eval_result is None
                or eval_result["score"] is None
                or eval_result["reasoning"] is None
            ):
                continue

            langfuse.score(
                name=criterion,
                trace_id=generation.trace_id,
                observation_id=generation.id,
                value=eval_result["score"],
                comment=eval_result["reasoning"],
            )
 


In [None]:
# Run the criteria evaluations and record the resulting scores in Langfuse.
execute_eval_and_score() 

In [None]:
# hallucination
 
 
def eval_hallucination():
    """Score every fetched generation for hallucination.

    One chain from ``get_hallucination_eval()`` is reused for all items in
    the module-level ``generations`` list. The generation's input is passed
    both as ``input`` and as ``reference``. Each result is printed; it is
    recorded in Langfuse under the name 'hallucination' only when it has
    both a score and a reasoning.
    """
    hallucination_chain = get_hallucination_eval()

    for gen in generations:
        verdict = hallucination_chain.evaluate_strings(
            prediction=gen.output,
            input=gen.input,
            reference=gen.input,
        )
        print(verdict)

        # Nothing usable to record: skip this generation.
        if verdict is None or verdict["score"] is None or verdict["reasoning"] is None:
            continue

        langfuse.score(
            name='hallucination',
            trace_id=gen.trace_id,
            observation_id=gen.id,
            value=verdict["score"],
            comment=verdict['reasoning'],
        )
 

In [None]:
# Run the hallucination check only when its flag is enabled. Uses plain
# truthiness, the same test execute_eval_and_score() applies to the other flags.
if EVAL_TYPES.get("hallucination"):
    eval_hallucination()

In [None]:
# The SDK sends events asynchronously, so flush the client here to make sure
# all queued scores are delivered before the notebook finishes.
langfuse.flush()