# Summary Evaluators

In [1]:
### Mount Google Drive into the Colab runtime
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory to this module's notebook folder on Drive
# (not the Drive root), so relative paths such as ".env" resolve from module_2.
%cd /content/drive/My\ Drive/Colab\ Notebooks/intro-to-langsmith-main/notebooks/module_2

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/intro-to-langsmith-main/notebooks/module_2


In [2]:
# Install/upgrade the LangChain, LangGraph and LangSmith packages used by the course.
# NOTE: versions are unpinned and -U pulls the latest release, so results may drift over time.
!pip install --quiet -U langchain-google-genai langgraph langgraph-sdk langgraph-checkpoint-sqlite langsmith langchain-community langchain-core
# Supporting packages (notebook tooling, .env loading, parsing and data libraries).
!pip install --quiet notebook python-dotenv lxml scikit-learn pandas pyarrow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.5/378.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.0/444.0 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# NOTE: python-dotenv is also installed by the preceding pip cell, so this
# install is redundant when the cells are run in order.
!pip install --quiet python-dotenv

In [4]:
from dotenv import load_dotenv
# Load variables from the ".env" file in the current working directory.
# The call is the last expression of the cell, so its return value is shown as the cell output.
load_dotenv(".env")

True

### Setup

In [None]:
# Option 1: configure everything inline by filling in the empty values below.
# NOTE: the classifier cell in this notebook reads GOOGLE_API_KEY, which is
# not among the variables set here.
import os

os.environ.update(
    {
        "OPENAI_API_KEY": "",
        "LANGSMITH_API_KEY": "",
        "LANGSMITH_TRACING": "true",
        "LANGSMITH_PROJECT": "langsmith-academy",
    }
)

In [None]:
# Or you can use a .env file
from dotenv import load_dotenv
# Reads the .env file two directories above the working directory.
# override=True lets values from the file replace variables already set in the environment.
load_dotenv(dotenv_path="../../.env", override=True)

### Task

Our task here is to analyze the toxicity of random statements, classifying them as `Toxic` or `Not toxic`.

Take a look at our dataset!

In [5]:
from langsmith import Client

# Client() is created without arguments; presumably it picks up the
# LANGSMITH_* settings from the environment — confirm against your setup.
ls_client = Client()
# Clone the shared public dataset from the URL below; the returned dataset
# object is what gets passed to evaluate() later in the notebook.
dataset = ls_client.clone_public_dataset(
    "https://smith.langchain.com/public/89ef0d44-a252-4011-8bb8-6a114afc1522/d"
)

This is a simple toxicity classifier!

In [13]:
# from openai import OpenAI
# openai_client = OpenAI()
import json
import os
from google import genai
# Gemini client used by the classifier below. The API key is read from the
# GOOGLE_API_KEY environment variable (os.getenv returns None if it is unset).
g_client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

from pydantic import BaseModel, Field

# Structured-output schema passed as `response_schema` to the Gemini request in
# `good_classifier`. Documented with comments rather than a class docstring,
# because pydantic includes a model's docstring in the generated JSON schema,
# which would change what is sent to the model.
class Toxicity(BaseModel):
    # Label string the model is asked to produce: 'Toxic' or 'Not toxic'.
    # The field is a plain `str`, so the schema itself does not restrict the value to those two labels.
    toxicity: str = Field(description="""'Toxic' if this the statement is toxic, 'Not toxic' if the statement is not toxic.""")

def good_classifier(inputs: dict) -> dict:
    """Classify one statement as toxic or not using Gemini structured output.

    Args:
        inputs: Example inputs; the text to classify is read from
            ``inputs['statement']``.

    Returns:
        A dict ``{"class": <label>}`` where the label is the ``toxicity``
        value parsed from the model's JSON reply.
    """
    # Single user turn carrying the statement to classify.
    user_turn = {
        "role": "user",
        "parts": [{"text": f"This is the statement: {inputs['statement']}"}],
    }
    # Ask for a JSON reply that follows the Toxicity schema.
    generation_config = {
        "response_mime_type": "application/json",
        "response_schema": Toxicity,
    }

    response = g_client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=[user_turn],
        config=generation_config,
    )

    # The JSON payload is the text of the first part of the first candidate.
    raw_json = response.candidates[0].content.parts[0].text
    label = json.loads(raw_json)["toxicity"]
    return {"class": label}


### Summary Evaluator

These are the fields that summary evaluator functions get access to:
- `inputs: list[dict]`: A list of inputs from the examples in our dataset
- `outputs: list[dict]`: A list of the dict outputs produced from running our target over each input
- `reference_outputs: list[dict]`: A list of reference_outputs from the examples in our dataset
- `runs: list[Run]`: A list of the Run objects from running our target over the dataset.
- `examples: list[Example]`: A list of the full dataset Examples, including the example inputs, outputs (if available), and metadata (if available).

Now we'll define our summary evaluator! Here, we'll compute the f1-score, which is a combination of precision and recall.

This sort of metric can only be computed over all of the examples in our experiment, so our evaluator takes in a list of outputs, and a list of reference_outputs.

In [7]:
def f1_score_summary_evaluator(outputs: list[dict], reference_outputs: list[dict]) -> dict:
    """Compute the F1 score of the "Toxic" class over a whole experiment.

    Each prediction is paired with its reference by position. "Toxic" is the
    positive class; any other value (including a missing "class" key, e.g.
    when a run produced no usable output) is treated as not positive. A Toxic
    reference whose prediction is anything other than "Toxic" is therefore a
    false negative instead of being dropped or raising ``KeyError``.

    Args:
        outputs: One dict per run, with the predicted label under "class".
        reference_outputs: One dict per example, with the expected label
            under "class".

    Returns:
        ``{"key": "f1_score", "score": <float>}``. The score is 0.0 when
        there are no true positives (which also covers empty input).
    """
    positive_label = "Toxic"
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for output_dict, reference_output_dict in zip(outputs, reference_outputs):
        # `or {}` tolerates None entries; `.get` tolerates a missing "class" key.
        predicted_positive = (output_dict or {}).get("class") == positive_label
        actually_positive = (reference_output_dict or {}).get("class") == positive_label

        if predicted_positive and actually_positive:
            true_positives += 1
        elif predicted_positive:
            false_positives += 1
        elif actually_positive:
            false_negatives += 1

    # With no true positives both precision and recall are 0 (or undefined),
    # so report 0.0 and avoid dividing by zero below.
    if true_positives == 0:
        return {"key": "f1_score", "score": 0.0}

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return {"key": "f1_score", "score": f1_score}


Note that we pass in `f1_score_summary_evaluator` as a summary evaluator!

In [14]:
# Run `good_classifier` over the examples in `dataset` as a new experiment.
# The F1 function is passed through `summary_evaluators`, so it is called with
# the outputs of the whole experiment rather than once per example.
results = ls_client.evaluate(
    good_classifier,
    data=dataset,
    summary_evaluators=[f1_score_summary_evaluator],
    experiment_prefix="Good classifier"
)

View the evaluation results for experiment: 'Good classifier-70f0b226' at:
https://smith.langchain.com/o/10172e86-d29b-46ce-8113-9bee95385ee1/datasets/eb01cf8a-4ba0-4f73-9feb-ed61d7c3fc56/compare?selectedSessions=d6ead8f4-bc7f-43f3-a798-037b9c4070a8




0it [00:00, ?it/s]