# Eval Starter Kit Notebook

In [None]:
from dotenv import load_dotenv
import json
import sys
import os

# Resolve the kit directory (parent of the notebook's working directory) and
# the repository root (parent of the kit directory) so that project modules
# such as `utils.eval` can be imported from this notebook.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

# Notebook cells are frequently re-run; only append a directory when it is not
# already present so sys.path does not accumulate duplicate entries.
for _path in (kit_dir, repo_dir):
    if _path not in sys.path:
        sys.path.append(_path)

# Load environment variables from the repository-level .env file.
# override=True is passed so values from the file replace variables that are
# already set in the process environment.
dotenv_path = os.path.join(repo_dir, '.env')
load_dotenv(dotenv_path, override=True)

from utils.eval.models import CorrectnessLLMJudge, WeaveChatModel
from utils.eval.evaluator import BaseWeaveEvaluator, BaseWeaveRAGEvaluator
import weave

In [2]:
# Judge model: its `score` method is used below to grade the test model's
# answer against the expected answer.
judge_settings = {
    "model_type": "sncloud",
    "model_name": "Meta-Llama-3.1-405B-Instruct",
    "max_tokens": 1024,
    "temperature": 0,
}
judge = CorrectnessLLMJudge(**judge_settings)

In [3]:
# Model under test: its `predict` method is called below with the example
# query and system message.
test_model_settings = {
    "model_type": "sncloud",
    "model_name": "Meta-Llama-3.1-405B-Instruct",
    "max_tokens": 1024,
    "temperature": 0,
}
test_model = WeaveChatModel(**test_model_settings)

In [4]:
# One hand-written sample used for the smoke test below: the system prompt,
# the user query, and the reference answer the judge compares against.
example = dict(
    system_message="You are a general knowledge expert.",
    query="What is the significance of the Heisenberg Uncertainty Principle in quantum mechanics?",
    expected_answer="The Heisenberg Uncertainty Principle states that it is impossible to simultaneously know both the exact position and momentum of a particle, highlighting fundamental limits to measurement in quantum mechanics.",
)

In [5]:
result = await test_model.predict(query=example["query"], system_message=example["system_message"])
score = await judge.score(query=example["query"], model_output=result, expected_answer=example["expected_answer"])

score

{'score': 1,
 'reason': 'The generated answer provides a clear and detailed explanation of the Heisenberg Uncertainty Principle, its significance, and implications in quantum mechanics, which matches the expected answer. Although the generated answer is larger than the expected answer, the content is correct and does not contradict the expected answer.'}

## Initialize your Weave project

In [None]:
# Placeholder project name: replace it with your own before running.
weave_project = 'your-project-name'
weave.init(weave_project)

## Evaluate multiple LLMs

In [7]:
# Build the evaluator with no arguments.
# NOTE(review): presumably the models to compare come from the project's own
# configuration — confirm in utils.eval.evaluator.
evaluator = BaseWeaveEvaluator()

In [8]:
await evaluator.evaluate(name="general_knowledge_test", filepath="../data/eval_data.csv")

## Evaluate a RAG chain

In [None]:
# Build the RAG evaluator with no arguments; it is populated with a document
# and run against a test file in the cells below.
rag = BaseWeaveRAGEvaluator()

In [7]:
# Feed the sample PDF to the RAG evaluator's vector DB.
# The path is relative to the notebook's working directory.
quickstart_pdf = '../data/QuickStart.pdf'
rag.populate_vectordb(quickstart_pdf)

In [8]:
await rag.evaluate(filepath="../data/rag_test.json")

