In [2]:
import openai
import os

from dotenv import find_dotenv, load_dotenv


# Load variables from the .env file located by find_dotenv(); the return value
# is bound to `_` because it is not used afterwards.
_ = load_dotenv(find_dotenv())
# Indexing os.environ (rather than .get) raises KeyError right here if the
# variable is missing, instead of failing later at the first API call.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Name of the chat model passed as `model=` to the completion calls in this notebook.
MODEL_NAME = os.environ["MODEL_NAME"]

In [None]:
from openai import OpenAI
from pydantic import BaseModel, Field


# Shared OpenAI client used by the evaluator functions in this notebook.
# Constructed with no explicit arguments — presumably it picks up credentials
# from the environment (TODO confirm against the openai SDK docs).
client = OpenAI()


# Structured-output schema passed as `response_format` to the chat completion
# call; it carries a single integer field holding the judged similarity.
# NOTE: documented with `#` comments rather than a docstring so the schema
# generated from this model is left untouched.
class Similarity_Score(BaseModel):
    similarity_score: int = Field(
        description=(
            "Semantic similarity score between 1 and 10, "
            "where 1 means unrelated and 10 means identical."
        ),
    )


def compare_semantic_similarity(inputs: dict, reference_outputs: dict, outputs: dict):
    """Score how close a run's answer is in meaning to the reference answer.

    An LLM judge (``MODEL_NAME`` via the module-level ``client``) is asked to
    compare the two responses and reply using the ``Similarity_Score`` schema.

    Args:
        inputs: Dataset example inputs; must contain the key ``"question"``.
        reference_outputs: Dataset example outputs; must contain the key
            ``"output"`` (the reference answer).
        outputs: Outputs of the run being evaluated; must contain the key
            ``"output"``.

    Returns:
        A dict ``{"score": <int>, "key": "similarity"}`` where ``score`` is the
        integer the judge returned.

    Raises:
        KeyError: If one of the required keys is missing.
        ValueError: If the completion carries no parsed result (for example
            because the model refused to answer).
    """
    # Builtin `dict` is used in the annotations: `typing.Dict` is never imported
    # in this notebook, so referencing it would raise NameError at definition
    # time on a fresh kernel.
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    completion = client.beta.chat.completions.parse(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
                    "Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar. "
                    "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning."
                ),
            },
            {"role": "user", "content": f"Question: {input_question}\n Reference Response: {reference_response}\n Run Response: {run_response}"}
        ],
        response_format=Similarity_Score,
    )

    message = completion.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Without this guard the attribute access below would fail with an
        # opaque "'NoneType' object has no attribute ..." error.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no parsed similarity score (refusal: {refusal!r})"
        )
    return {"score": parsed.similarity_score, "key": "similarity"}


In [5]:
# Dataset example: the question plus its reference (ground-truth) answer.
inputs = {"question": "Is LangSmith natively integrated with LangChain?"}
reference_outputs = {
    "output": "Yes, LangSmith is natively integrated with LangChain, as well as LangGraph."
}

# Run: the answer under evaluation, which contradicts the reference.
outputs = {"output": "No, LangSmith is NOT integrated with LangChain."}

similarity_score = compare_semantic_similarity(inputs, reference_outputs, outputs)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'score': 1, 'key': 'similarity'}


In [6]:
# You can also define evaluators that take the Run and Example directly.

from langsmith.schemas import Run, Example


def compare_semantic_similarity_v2(root_run: Run, example: Example):
    """Score how close a run's answer is in meaning to the example's reference answer.

    Accepts either plain dicts (as in the sample cell of this notebook) or
    objects exposing ``inputs`` / ``outputs`` attributes. The original version
    only subscripted its arguments, which works for dicts but not for objects
    that expose their fields as attributes — NOTE(review): LangSmith ``Run`` /
    ``Example`` instances are expected to be of the latter kind; confirm
    against the langsmith schemas.

    Args:
        root_run: The run being evaluated; its ``outputs`` must contain the
            key ``"output"``.
        example: The dataset example; its ``inputs`` must contain
            ``"question"`` and its ``outputs`` must contain ``"output"``.

    Returns:
        A dict ``{"score": <int>, "key": "similarity"}`` where ``score`` is the
        integer the judge returned.

    Raises:
        KeyError: If a required key is missing.
        ValueError: If the completion carries no parsed result (for example
            because the model refused to answer).
    """

    def _field(record, name):
        # Dicts are read by key; anything else is read by attribute.
        if isinstance(record, dict):
            return record[name]
        return getattr(record, name)

    input_question = _field(example, "inputs")["question"]
    reference_response = _field(example, "outputs")["output"]
    run_response = _field(root_run, "outputs")["output"]

    completion = client.beta.chat.completions.parse(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
                    "Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar. "
                    "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning."
                ),
            },
            {"role": "user", "content": f"Question: {input_question}\n Reference Response: {reference_response}\n Run Response: {run_response}"}
        ],
        response_format=Similarity_Score,
    )

    message = completion.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Without this guard the attribute access below would fail with an
        # opaque "'NoneType' object has no attribute ..." error.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no parsed similarity score (refusal: {refusal!r})"
        )
    return {"score": parsed.similarity_score, "key": "similarity"}

In [7]:
# Dict stand-in for a run: only "outputs" is read by the evaluator, the other
# keys are carried along as sample payload.
sample_run = {
    "name": "Sample Run",
    "inputs": {"question": "Is LangSmith natively integrated with LangChain?"},
    "outputs": {"output": "No, LangSmith is NOT integrated with LangChain."},
    "is_root": True,
    "status": "success",
    "extra": {"metadata": {"key": "value"}},
}

# Dict stand-in for a dataset example holding the question and reference answer.
sample_example = {
    "inputs": {"question": "Is LangSmith natively integrated with LangChain?"},
    "outputs": {
        "output": "Yes, LangSmith is natively integrated with LangChain, as well as LangGraph."
    },
    "metadata": {"dataset_split": ["AI generated", "base"]},
}

similarity_score = compare_semantic_similarity_v2(sample_run, sample_example)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'score': 1, 'key': 'similarity'}
