# Pairwise Experiments

### Setup

In [1]:
# Or you can use a .env file
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../.env", override=True)

True

### Task

Let's set up a new task! Here, we have a salesperson named Bob. Bob has a lot of deals, so he wants to summarize what happened in this deals based off of some meeting transcripts.

Bob is iterating on a few different prompts, that will give him nice, concise transcripts for his deals.

Bob has curated a dataset of his deal transcripts, let's go ahead and load that in. You can take a look at the dataset as well if you're curious! Note that this is not a golden dataset, there is no reference output here.

In [2]:
from langsmith import Client

client = Client()
dataset = client.clone_public_dataset(
  "https://smith.langchain.com/public/9078d2f1-7bef-4ba7-b795-210a17682ef9/d"
)

### Experiments

Now, let's run some experiments on this dataset using two different prompts. Let's add an evaluator that tries to score how good our summaries are!

In [3]:
from pydantic import BaseModel, Field
from openai import OpenAI

openai_client = OpenAI()

SUMMARIZATION_SYSTEM_PROMPT = """You are a judge, aiming to score how well a summary summarizes the content of a transcript"""

SUMMARIZATION_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}
[The Start of Summarization] {summary} [The End of Summarization]"""

class SummarizationScore(BaseModel):
    score: int = Field(description="""A score from 1-5 ranking how good the summarization is for the provided transcript, with 1 being a bad summary, and 5 being a great summary""")
    
def summary_score_evaluator(inputs: dict, outputs: dict) -> list:
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {   
                "role": "system",
                "content": SUMMARIZATION_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": SUMMARIZATION_HUMAN_PROMPT.format(
                    transcript=inputs["transcript"],
                    summary=outputs.get("output", "N/A"),
                )}
        ],
        response_format=SummarizationScore,
    )

    summary_score = completion.choices[0].message.parsed.score
    return {"key": "summary_score", "score": summary_score}

First, we'll run our experiment with a good version of our prompt!

In [4]:
# Prompt One: Good Prompt!
def good_summarizer(inputs: dict):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Concisely summarize this meeting in 3 sentences. Make sure to include all of the important events. Meeting: {inputs['transcript']}"
            }
        ],
    )
    return response.choices[0].message.content

client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Good Summarizer"
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'Good Summarizer-065a8ca1' at:
https://smith.langchain.com/o/58237f5e-f0c5-4c78-b71c-186c54d72106/datasets/8114f85d-c0f7-45f6-a94b-e418ef4038b9/compare?selectedSessions=aa3f1a70-6d81-4e00-828f-45ebd9f5b12e




5it [00:13,  2.72s/it]


Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob and Mr. Carter met to discuss trading in M...,,5,1.74095,0ba6de63-8145-4740-a8e6-41d18435455f,726f4ef6-cf8a-4dab-a700-64c4a486c93f
1,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob and Mr. Patel met to discuss Mr. Patel's i...,,5,2.150314,11fe2b39-3361-4204-a986-fef875f9883a,8e51b010-043c-4298-99fb-0d54e8ed4d9e
2,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,"In the meeting, Bob presented various vehicle ...",,5,1.36402,4c4fbaae-5cb4-4d3d-9697-3be774b1767f,6af78ba5-30d1-4cf7-9fb4-65ba4087dc61
3,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...","Bob welcomed Ms. Thompson to Ford Motors, wher...",,5,1.706803,570fa57c-a170-4f70-b597-cb5bbd705ec1,2e2ec9c4-0fc0-4f67-9d3a-3a7c0444bb95
4,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,"Bob and Mr. Johnson met at Ford Motors, where ...",,5,2.249546,6b441f19-7775-4ea4-ba2c-22ab6ff8db81,cf290742-031b-4c22-96a1-7f888c66a904


Now, we'll run an experiment with a worse version of our prompt, to highlight the difference.

In [5]:
# Prompt Two: Worse Prompt!
def bad_summarizer(inputs: dict):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Summarize this in one sentence. {inputs['transcript']}"
            }
        ],
    )
    return response.choices[0].message.content

client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Bad Summarizer"
)

View the evaluation results for experiment: 'Bad Summarizer-a051143b' at:
https://smith.langchain.com/o/58237f5e-f0c5-4c78-b71c-186c54d72106/datasets/8114f85d-c0f7-45f6-a94b-e418ef4038b9/compare?selectedSessions=64c4a610-b7e0-45bd-b60e-1705c1e75183




5it [00:11,  2.26s/it]


Unnamed: 0,inputs.transcript,outputs.output,error,feedback.summary_score,execution_time,example_id,id
0,Bob and Mr. Carter (CLOSED DEAL): Bob: Welcome...,Bob successfully closed a deal with Mr. Carter...,,5,1.561225,0ba6de63-8145-4740-a8e6-41d18435455f,fe00edf2-82ca-4b9e-842c-38902cecafcc
1,"Bob and Mr. Patel (CLOSED DEAL): Bob: Hello, M...",Bob successfully closed a deal with Mr. Patel ...,,5,1.326149,11fe2b39-3361-4204-a986-fef875f9883a,51bb030e-6106-4399-8944-d942609d18ff
2,Bob and Ms. Nguyen (NO DEAL): Bob: Good aftern...,Bob suggested several vehicle options to Ms. N...,,4,1.290431,4c4fbaae-5cb4-4d3d-9697-3be774b1767f,7a628fc7-68b1-4be4-b85e-e75d5c630dbe
3,"Bob and Ms. Thompson (NO DEAL): Bob: Hi, Ms. T...",Bob and Ms. Thompson discussed her interest in...,,5,1.368918,570fa57c-a170-4f70-b597-cb5bbd705ec1,2f1142f9-ee35-487b-91b3-37d4835fb8f2
4,Bob and Mr. Johnson (CLOSED DEAL): Bob: Good m...,Bob successfully closed a deal with Mr. Johnso...,,5,1.460293,6b441f19-7775-4ea4-ba2c-22ab6ff8db81,27b630c1-0530-4c79-9d3c-369e70148e96


### Pairwise Experiment

Let's define a function that will compare our two experiments. These are the fields that pairwise evaluator functions get access to:
- `inputs: dict`: A dictionary of the inputs corresponding to a single example in a dataset.
- `outputs: list[dict]`: A list of the dict outputs produced by each experiment on the given inputs.
- `reference_outputs: dict`: A dictionary of the reference outputs associated with the example, if available.
- `runs: list[Run]`: A list of the full Run objects generated by the experiments on the given example. Use this if you need access to intermediate steps or metadata about each run.
- `example: Example`: The full dataset Example, including the example inputs, outputs (if available), and metdata (if available).

First, let's give our LLM-as-Judge some instructions. In our case, we're just going to directly use LLM-as-judge to grade which of the summarizers is the most helpful.

It might be hard to grade our summarizers without a ground truth reference, but here, comparing different prompts head to head will give us a sense of which is better!

In [6]:
JUDGE_SYSTEM_PROMPT = """
Please act as an impartial judge and evaluate the quality of the summarizations provided by two AI summarizers to the meeting transcript below.
Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their summarizations. 
Begin your evaluation by comparing the two summarizations and provide a short explanation. 
Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. 
Do not favor certain names of the assistants. 
Be as objective as possible. """

JUDGE_HUMAN_PROMPT = """
[The Meeting Transcript] {transcript}

[The Start of Assistant A's Summarization] {answer_a} [The End of Assistant A's Summarization]

[The Start of Assistant B's Summarization] {answer_b} [The End of Assistant B's Summarization]"""

Our function will take in an `inputs` dictionary, and a list of `outputs` dictionaries for the different experiments that we want to compare.

In [7]:
from pydantic import BaseModel, Field

class Preference(BaseModel):
    preference: int = Field(description="""1 if Assistant A answer is better based upon the factors above.
2 if Assistant B answer is better based upon the factors above.
Output 0 if it is a tie.""")
    
def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {   
                "role": "system",
                "content": JUDGE_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": JUDGE_HUMAN_PROMPT.format(
                    transcript=inputs["transcript"],
                    answer_a=outputs[0].get("output", "N/A"),
                    answer_b=outputs[1].get("output", "N/A")
                )}
        ],
        response_format=Preference,
    )

    preference_score = completion.choices[0].message.parsed.preference

    if preference_score == 1:
        scores = [1, 0]
    elif preference_score == 2:
        scores = [0, 1]
    else:
        scores = [0, 0]
    return scores

Now let's run our pairwise experiment with `evaluate()`

In [8]:
from langsmith import evaluate

evaluate(
    ("Good Summarizer-312944ad", "Bad Summarizer-6713951c"),  # TODO: Replace with the names/IDs of your experiments
    evaluators=[ranked_preference]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/58237f5e-f0c5-4c78-b71c-186c54d72106/datasets/8114f85d-c0f7-45f6-a94b-e418ef4038b9/compare?selectedSessions=b36d6bce-e1b9-416e-ba11-38d4e855fed6%2C096052d6-5ab8-4bdb-86b8-7f6bb4d189e7&comparativeExperiment=5d6bda88-a1a2-4e6a-993a-ea0c1812ddff




100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


<langsmith.evaluation._runner.ComparativeExperimentResults at 0x111a601a0>

When we compared them head to head we could see that the llm always prefers the good prompt in all the 5 cases, even though independently their evaluation scores were the same

![image.png](attachment:image.png)

## Imported my own dataset and defined a pairwise experiment of my own

In [9]:
from langsmith import Client

client = Client()
dataset = client.clone_public_dataset(
  "https://smith.langchain.com/public/fa1764ff-56cd-420b-b19a-be3d8fbad4c3/d"
)


In [10]:
from pydantic import BaseModel, Field

class SummarizationScore(BaseModel):
    score: int = Field(description="A score from 1-5 ranking how good the answer is for the customer support question, with 1 being bad and 5 being great.")

SUMMARIZATION_SYSTEM_PROMPT = """You are a judge, aiming to score how well an answer addresses the customer's support question."""

SUMMARIZATION_HUMAN_PROMPT = """
The Customer Support Question: {question}
[Start of Answer] {summary} [End of Answer]
"""

def summary_score_evaluator(inputs: dict, outputs: dict) -> list:
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": SUMMARIZATION_HUMAN_PROMPT.format(
                    question=inputs["question"],
                    summary=outputs.get("output", "N/A"),
                ),
            },
        ],
        response_format=SummarizationScore,
    )
    summary_score = completion.choices[0].message.parsed.score
    return {"key": "summary_score", "score": summary_score}


In [11]:
# Cell 1: Good summarizer
def good_summarizer(inputs: dict):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Provide a detailed and accurate answer for the following customer support query:\nQuestion: {inputs['question']}"
            }
        ]
    )
    return response.choices[0].message.content

client.evaluate(
    good_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Good Summarizer")


View the evaluation results for experiment: 'Good Summarizer-e712b79a' at:
https://smith.langchain.com/o/58237f5e-f0c5-4c78-b71c-186c54d72106/datasets/31ad4f5b-9dc5-4879-bf59-24dfd22e0aec/compare?selectedSessions=a576bac4-912e-4b62-b1a0-795850581409




5it [00:52, 10.48s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.answer,feedback.summary_score,execution_time,example_id,id
0,Do you offer student discounts?,Thank you for reaching out with your question!...,,"Yes, we offer 20% off for students. Verify you...",4,13.474874,4dde21b5-4c14-473d-abbc-527cc2d64425,5b3c53af-7dcb-4608-b5df-b169b60569c3
1,Can I change my subscription plan?,"Yes, you can change your subscription plan, an...",,Yes! Go to Account Settings > Subscription and...,5,9.424366,76b1a3f0-860c-4c20-a2ab-2bde109a5dd0,4c045621-e23c-4a47-a255-91a13d3ea92b
2,How long does shipping take?,The shipping duration can vary based on severa...,,Standard shipping takes 5-7 business days. Exp...,5,7.787275,5dd918e2-cd7c-4f8f-ae16-24cb202f3256,a7d469a7-e609-4cf0-b0d3-7444733a0a21
3,What is your refund policy?,Our refund policy is designed to ensure custom...,,We offer a 30-day money-back guarantee. Contac...,5,8.182361,8df3cdfe-b57c-4aea-8f9a-ee5a7d2469d1,f156aa9a-f229-4d0c-94f2-8e757267dad3
4,How do I reset my password?,"If you need to reset your password, you can fo...",,"Go to the login page, click 'Forgot Password',...",4,8.807558,c13f215e-cc0b-4861-b3a4-f2fcf9e3fb4f,645e9ce0-1ba2-497c-a926-7b8fcc940ea4


In [12]:
# Cell 4: Bad Summarizer
def bad_summarizer(inputs: dict):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": f"Answer this in one short sentence: {inputs['question']}"
            }
        ]
    )
    return response.choices[0].message.content

client.evaluate(
    bad_summarizer,
    data=dataset,
    evaluators=[summary_score_evaluator],
    experiment_prefix="Bad Summarizer",
)


View the evaluation results for experiment: 'Bad Summarizer-d39e9e64' at:
https://smith.langchain.com/o/58237f5e-f0c5-4c78-b71c-186c54d72106/datasets/31ad4f5b-9dc5-4879-bf59-24dfd22e0aec/compare?selectedSessions=3d181ab9-96b9-41e1-8a10-1e158250a025




5it [00:09,  1.93s/it]


Unnamed: 0,inputs.question,outputs.output,error,reference.answer,feedback.summary_score,execution_time,example_id,id
0,Do you offer student discounts?,"I'm unable to offer student discounts, but I r...",,"Yes, we offer 20% off for students. Verify you...",3,0.979147,4dde21b5-4c14-473d-abbc-527cc2d64425,73126894-f4a6-45eb-9a67-7316fcdb2c33
1,Can I change my subscription plan?,"Yes, you can typically change your subscriptio...",,Yes! Go to Account Settings > Subscription and...,4,0.835669,76b1a3f0-860c-4c20-a2ab-2bde109a5dd0,f85c0d9e-a459-40ee-8b7c-3ffb225f842e
2,How long does shipping take?,Shipping duration varies based on the service ...,,Standard shipping takes 5-7 business days. Exp...,4,0.976566,5dd918e2-cd7c-4f8f-ae16-24cb202f3256,83f6d30d-c5e7-4dcc-8860-c5dc67d28971
3,What is your refund policy?,Our refund policy allows returns within 30 day...,,We offer a 30-day money-back guarantee. Contac...,4,1.055247,8df3cdfe-b57c-4aea-8f9a-ee5a7d2469d1,202aac79-9ea8-476d-a5d9-8d3befa69406
4,How do I reset my password?,"Follow the ""Forgot Password"" link on the login...",,"Go to the login page, click 'Forgot Password',...",5,0.997035,c13f215e-cc0b-4861-b3a4-f2fcf9e3fb4f,365133c8-0e9c-4c4c-845f-d5d07d51e96c


In [13]:
# Cell 5: Impartial Judge & Pairwise Prompts
JUDGE_SYSTEM_PROMPT = """
You are an impartial judge evaluating two chatbot answers for a customer support question.
Consider helpfulness, relevance, accuracy, conciseness, and completeness.
Select the better answer and briefly explain why.
"""

JUDGE_HUMAN_PROMPT = """
Customer Support Question: {question}

[Start of Assistant A's Answer]
{answer_a}
[End of Assistant A's Answer]

[Start of Assistant B's Answer]
{answer_b}
[End of Assistant B's Answer]
"""


In [14]:
from pydantic import BaseModel, Field

class Preference(BaseModel):
    preference: int = Field(description="""1 if Assistant A answer is better based upon the factors above.
2 if Assistant B answer is better based upon the factors above.
Output 0 if it is a tie.""")
    
def ranked_preference(inputs: dict, outputs: list[dict]) -> list:
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {   
                "role": "system",
                "content": JUDGE_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": JUDGE_HUMAN_PROMPT.format(
                    transcript=inputs["transcript"],
                    answer_a=outputs[0].get("output", "N/A"),
                    answer_b=outputs[1].get("output", "N/A")
                )}
        ],
        response_format=Preference,
    )

    preference_score = completion.choices[0].message.parsed.preference

    if preference_score == 1:
        scores = [1, 0]
    elif preference_score == 2:
        scores = [0, 1]
    else:
        scores = [0, 0]
    return scores

In [16]:
from langsmith import evaluate

evaluate(
         ("Good Summarizer-7c013f2e", "Bad Summarizer-f300e14b"), 
         evaluators=[ranked_preference]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/58237f5e-f0c5-4c78-b71c-186c54d72106/datasets/31ad4f5b-9dc5-4879-bf59-24dfd22e0aec/compare?selectedSessions=a0f3399c-21ea-4596-9483-4b4f3c7ea87a%2C95896916-b1db-490d-a639-ecdfcc63c73a&comparativeExperiment=aa395584-3564-4d93-81c6-cdc2f0e494b7




  0%|          | 0/5 [00:00<?, ?it/s]


KeyError: 'transcript'