In [1]:
# Load environment variables from a .env file before any client is created.
# NOTE(review): presumably this supplies the provider / LangSmith API keys, and
# override=True presumably lets .env values replace variables that are already
# set in the process environment — confirm against the python-dotenv docs.
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [3]:
# Chat models available for the router / structured-output experiments.
# All three are configured identically: timeout=30 and temperature=0.
from langchain.chat_models import init_chat_model

model_gemini_flash = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    timeout=30,
    temperature=0,
)
model_llama_groq = init_chat_model(
    "llama-3.1-8b-instant",
    model_provider="groq",
    timeout=30,
    temperature=0,
)
model_gpt_4o_mini = init_chat_model(
    "gpt-4o-mini",
    model_provider="openai",
    timeout=30,
    temperature=0,
)


In [None]:
from typing_extensions import Literal
from langchain_core.messages import HumanMessage, SystemMessage
from langsmith import traceable
from pydantic import BaseModel, Field

class ClassificationResult(BaseModel):
    """Structured-output schema for the toxicity classifier.

    Attributes:
        result: The label chosen by the model, either "TOXIC" or "NOT_TOXIC".
    """

    # The allowed labels must match the ones requested in the classifier prompt
    # and stored as reference labels in the evaluation dataset ("NOT_TOXIC",
    # not "NON_TOXIC"); otherwise the exact-match evaluator can never succeed
    # on non-toxic examples.
    # The text is passed as `description=`: the first positional argument of
    # Field() is the *default value*, which would make the field optional.
    result: Literal["TOXIC", "NOT_TOXIC"] = Field(
        description="llm's classification result if the user input text is 'TOXIC' or 'NOT_TOXIC'"
    )

@traceable
def toxicity_classifier(input: dict) -> dict:
    """Classify a piece of user text with the Groq Llama model.

    Args:
        input: Mapping that must contain the key "text" holding the user query.

    Returns:
        A dict of the form {"class": <label>}, where <label> is the `result`
        field of the structured `ClassificationResult` returned by the model.

    Raises:
        KeyError: If `input` has no "text" key.
    """
    system_prompt = (
      "Please review the user query below and determine if it contains any form of toxic behavior, "
      "such as insults, threats, or highly negative comments. Respond with 'TOXIC' if it does "
      "and 'NOT_TOXIC' if it doesn't."
    )
    # Bind the output schema first, then build the conversation and invoke.
    structured_model = model_llama_groq.with_structured_output(ClassificationResult)
    messages = [SystemMessage(system_prompt), HumanMessage(input["text"])]
    verdict = structured_model.invoke(messages)
    return {"class": verdict.result}

    

In [16]:
# Quick manual check: run the classifier on one clearly insulting input.
toxicity_classifier({"text": "Shut up, idiot"})

content='TOXIC' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 90, 'total_tokens': 94, 'completion_time': 0.010150632, 'completion_tokens_details': None, 'prompt_time': 0.007598481, 'prompt_tokens_details': None, 'queue_time': 0.051507409, 'total_time': 0.017749113}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_ff2b098aaf', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None, 'model_provider': 'groq'} id='lc_run--019b9718-b691-7da0-860f-b41b644c3807-0' tool_calls=[] invalid_tool_calls=[] usage_metadata={'input_tokens': 90, 'output_tokens': 4, 'total_tokens': 94}


{'class': 'TOXIC'}

In [17]:
from langsmith import Client
# LangSmith client; used below to create the dataset and its examples and to
# run the evaluation.
ls_client = Client()

# Labelled evaluation examples: each entry pairs an input text with its
# reference label ("TOXIC" / "NOT_TOXIC").
examples = [
    {"inputs": {"text": text}, "outputs": {"label": label}}
    for text, label in (
        ("Shut up, idiot", "TOXIC"),
        ("You're a wonderful person", "NOT_TOXIC"),
        ("This is the worst thing ever", "TOXIC"),
        ("I had a great day today", "NOT_TOXIC"),
        ("Nobody likes you", "TOXIC"),
        ("This is unacceptable. I want to speak to the manager.", "NOT_TOXIC"),
    )
]

dataset_name="Toxic Queries"
# Create and populate the dataset on the first run. On later runs the dataset
# already exists, so fetch it instead: `dataset` must be bound either way,
# because later cells read it (otherwise a re-run fails with NameError).
if ls_client.has_dataset(dataset_name=dataset_name):
    dataset = ls_client.read_dataset(dataset_name=dataset_name)
else:
    dataset = ls_client.create_dataset(dataset_name=dataset_name)
    ls_client.create_examples(
        dataset_id=dataset.id,
        examples=examples,
        )

In [20]:
def correct(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Exact-match evaluator: does the predicted class equal the reference label?

    Args:
        inputs: The example inputs (not used by this evaluator).
        outputs: The classifier output; must contain the key "class".
        reference_outputs: The reference output; must contain the key "label".

    Returns:
        True when outputs["class"] equals reference_outputs["label"], else False.

    Raises:
        KeyError: If either expected key is missing.
    """
    predicted = outputs["class"]
    expected = reference_outputs["label"]
    return predicted == expected

In [21]:
# Run the classifier over every example in the dataset and score it with the
# exact-match evaluator.
# The dataset is referenced through `dataset_name`, which is always defined,
# rather than `dataset.name`: `dataset` is only bound when the dataset was
# created in this kernel session, so a re-run would raise NameError.
results = ls_client.evaluate(
    toxicity_classifier,
    data=dataset_name,
    evaluators=[correct],
    experiment_prefix="llama-3.1-8b-instant, groq",  # optional, experiment name prefix
    description="Testing the baseline system.",  # optional, experiment description
    max_concurrency=4,  # optional, add concurrency
)

View the evaluation results for experiment: 'llama-3.1-8b-instant, groq-2fb1b244' at:
https://smith.langchain.com/o/469b91a8-b891-4cd4-b99e-a7ed78ed0542/datasets/8759ad99-ea4b-494e-bffb-e75537a9542a/compare?selectedSessions=fe69b632-fab9-4e51-9845-1dfa0840b82a




5it [00:00, 13.70it/s]

content='NOT_TOXIC' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 91, 'total_tokens': 96, 'completion_time': 0.004340536, 'completion_tokens_details': None, 'prompt_time': 0.005066113, 'prompt_tokens_details': None, 'queue_time': 0.050370957, 'total_time': 0.009406649}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_f757f4b0bf', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None, 'model_provider': 'groq'} id='lc_run--019b971b-6bd6-7301-95bc-3dfaf0c56dba-0' tool_calls=[] invalid_tool_calls=[] usage_metadata={'input_tokens': 91, 'output_tokens': 5, 'total_tokens': 96}
content='TOXIC' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 88, 'total_tokens': 92, 'completion_time': 0.013816143, 'completion_tokens_details': None, 'prompt_time': 0.005054011, 'prompt_tokens_details': None, 'queue_time': 0.061743959, 'total_time': 0.018870154}, 'model_name': 'llama-

6it [00:01,  5.93it/s]
