# Exp014: Evaluation metrics
The goal of this experiment is to evaluate the three tasks on several quality metrics.

In [43]:
import nltk
nltk.download("punkt")
from nltk.util import ngrams
import os
import re
import numpy as np
from tqdm.notebook import tqdm

import sys
sys.path.append('../source')
import models
import helpers
import api

[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Let's start with distinctiveness. This metric should be calculated over responses generated for the same constraints but for different contexts, to make sure that the model does not simply produce the same wording regardless of the context.

In [14]:
def calculate_distinct_n(texts, n=2):
    """Return the share of unique word n-grams among all n-grams of the texts.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as a one-element list.
        n: Size of the n-grams (default 2, i.e. distinct-2).

    Returns:
        unique n-grams / total n-grams, counted over all texts together
        (the per-text n-gram lists are merged with
        ``helpers.flatten_list_of_lists``). Returns 0 when no n-gram exists.
    """
    if isinstance(texts, str):
        texts = [texts]

    grams_by_text = []
    for text in texts:
        tokens = nltk.word_tokenize(text)
        grams_by_text.append(list(ngrams(tokens, n)))

    pooled = helpers.flatten_list_of_lists(grams_by_text)
    total = len(pooled)
    if total <= 0:
        return 0
    return len(set(pooled)) / total

Add an easy way to summarize skill detection

In [18]:
class GrammarDetection():
    """Collection of per-skill grammar classifiers with scoring helpers.

    Attributes are kept minimal: ``self.classifiers`` maps a skill number to
    the object returned by ``models.load_classifier`` for that skill.
    """

    def __init__(self, dir="corpus_training", skill_nrs=None):
        """Load one classifier per skill number.

        Args:
            dir: Sub-folder of ``../models`` that holds the ``<nr>.pth``
                checkpoints. It is used both for discovering skill numbers
                and as the second argument of ``models.load_classifier``.
            skill_nrs: Iterable of skill numbers to load. When ``None``, every
                ``<integer>.pth`` file found in ``../models/{dir}`` is loaded.
        """
        if skill_nrs is None:
            # Only consider "<integer>.pth" entries so stray files or folders
            # (e.g. .ipynb_checkpoints) do not break int() parsing.
            skill_nrs = sorted(
                int(stem)
                for stem, ext in map(os.path.splitext, os.listdir(f"../models/{dir}"))
                if ext == ".pth" and stem.isdigit()
            )
        # Load from the same folder that was listed; previously the folder
        # name was hard-coded here, which silently ignored the `dir` argument.
        self.classifiers = {nr: models.load_classifier(nr, dir) for nr in skill_nrs}

    def score_texts(self, sentences):
        """Run every loaded classifier on ``sentences``.

        Returns:
            dict mapping skill number to the raw result of
            ``models.probe_model`` for that classifier.
        """
        return {nr: models.probe_model(classifier, sentences) for nr, classifier in self.classifiers.items()}

    def constraint_satisfaction(self, text, constraints):
        """Fraction of sentences in ``text`` that trigger each constraint.

        Args:
            text: Response text; it is split into sentences with
                ``nltk.sent_tokenize``.
            constraints: Iterable of skill numbers; each must be a key of
                ``self.classifiers``.

        Returns:
            List with one float per constraint (same order): the number of
            sentences whose classifier output exceeds 0.5 divided by the number
            of sentences. Empty or whitespace-only text yields 0.0 for every
            constraint instead of dividing by zero.
        """
        if not text or not text.strip():
            return [0.0 for _ in constraints]
        sentences = nltk.sent_tokenize(text)
        if not sentences:
            return [0.0 for _ in constraints]
        hits = []
        for nr in constraints:
            outputs = models.probe_model(self.classifiers[nr], sentences)
            # NOTE(review): outputs[0] is presumably one score per sentence -
            # confirm against models.probe_model.
            hits.append(sum(outputs[0] > 0.5).item() / len(sentences))
        return hits

In [19]:
# Shared detector used by the evaluation functions below; loads every
# classifier checkpoint found in the default "corpus_training" folder.
detector = GrammarDetection(dir="corpus_training", skill_nrs=None)

Other response quality metrics

In [1]:
# Scoring instructions keyed by metric name; each value is sent as the system
# prompt. Only "Grammatical Correctness" is judged without reference to the
# Context.
gpt_metrics = {
    metric: f"{lead} from 1-5 the Response in terms of {metric}. Provide a single score and nothing else."
    for metric, lead in (
        ("Appropriateness", "Given the Context, evaluate"),
        ("Relevance", "Given the Context, evaluate"),
        ("Content Richness", "Given the Context, evaluate"),
        ("Grammatical Correctness", "Evaluate"),
    )
}

In [2]:
def completion_to_score(message):
    """Extract a 1-5 score from a model completion.

    Args:
        message: Text of the completion. Anything that is not a string is
            treated as "no score".

    Returns:
        The mean of all scores found in ``message`` as a float, or ``-1`` when
        no score can be extracted.

    Mentions of the scale itself are ignored so they do not distort the
    result: the denominator in "4/5" or "4 out of 5", and a range echo such as
    "1-5" or "1 to 5". Decimal scores such as "3.5" are read as one number.
    """
    if not isinstance(message, str):
        return -1

    # Drop the scale denominator ("/5", "out of 5") and echoes of the range.
    cleaned = re.sub(r"(?:/|\bout\s+of)\s*5\b", " ", message, flags=re.IGNORECASE)
    cleaned = re.sub(r"\b1\s*(?:-|to)\s*5\b", " ", cleaned, flags=re.IGNORECASE)

    # A score is a standalone number starting with 1-5, optionally with a
    # decimal part; it must not be embedded in a longer number or word.
    matches = re.findall(r"(?<![\w.])[1-5](?:\.\d+)?(?!\w)", cleaned)
    scores = [value for value in map(float, matches) if value <= 5]
    if not scores:
        return -1
    return float(np.mean(scores))

In [38]:
def get_response_quality(context, responses, model="gpt-3.5-turbo", max_attempts=2, fallback_score=3):
    """Rate each response on every metric in ``gpt_metrics`` via the chat API.

    Args:
        context: Dialogue context; it is interpolated into the user prompt
            as-is, so pass a string.
        responses: Iterable of candidate responses to rate.
        model: Chat model name passed to ``api.get_openai_chat_completion``.
        max_attempts: Maximum number of API calls per (response, metric) pair
            while no score can be parsed from the completion.
        fallback_score: Value recorded when no attempt yields a parsable score.

    Returns:
        dict mapping each metric name to a list of scores, one per response,
        in the order of ``responses``.
    """
    preds = {metric: [] for metric in gpt_metrics}
    for res in tqdm(responses, desc="Responses", leave=False):
        # The user prompt only depends on the response, not on the metric.
        text_prompt = f"Context:{context}\nResponse:{res}"
        for metric, prompt in gpt_metrics.items():
            gpt_score = -1
            attempts = 0
            while gpt_score == -1 and attempts < max_attempts:
                # Use a dedicated name for the API result so the `responses`
                # argument being iterated is never rebound.
                completions = api.get_openai_chat_completion(
                    model=model,
                    temperature=0.0,
                    max_tokens=20,
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": text_prompt},
                    ],
                )
                # NOTE(review): assumes the first element is the completion
                # text - confirm against api.get_openai_chat_completion.
                gpt_score = completion_to_score(completions[0])
                attempts += 1
            preds[metric].append(gpt_score if gpt_score != -1 else fallback_score)
    return preds

In [12]:
# Smoke test of the GPT-based quality metrics on a tiny dialogue.
# The context is a plain string (not a list): it is interpolated directly into
# the prompt, and a list would be rendered as its repr with brackets and
# escaped newlines.
context = """
A: Hello!
B: How are you doing?
"""
responses = ["""
I'm good. How about you?
""", """
No.
""", """
To be honest, not that good. My mother had a car crash yesterday.
"""]

get_response_quality(context, responses)

100%|█████████████| 3/3 [00:09<00:00,  3.09s/it]


{'Appropriateness': [4.0, 1.0, 2.0],
 'Relevance': [4.0, 1.0, 2.0],
 'Content Richness': [2.0, 1.0, 4.0],
 'Grammatical Correctness': [5.0, 1.0, 4.0]}

Example evaluation

In [23]:
context = """
A: Hey brother, do you already have plans for the summer?
"""
responses = ["I would like to invite you to dinner tonight.", "I would like to invite our entire family to my wedding."]


def evaluate_responses(context, responses, positive_skills, negative_skills):
    """Evaluate a set of responses to a single context.

    Input: context, positive constraints, negative constraints and list of responses
    Output: Distinctiveness of responses, For each response: Positive satisfaction per sentence, Negative satisfaction per sentence, Quality measures

    Args:
        context: Dialogue context the responses reply to.
        responses: List of response strings.
        positive_skills: Skill numbers passed to
            ``detector.constraint_satisfaction`` as positive constraints.
        negative_skills: Skill numbers passed to
            ``detector.constraint_satisfaction`` as negative constraints.

    Returns:
        dict with "Distinctiveness" (distinct-2 over all responses),
        "positive_constraints" and "negative_constraints" (one list of
        per-skill sentence fractions per response) plus one entry per quality
        metric returned by ``get_response_quality``.
    """
    distinct_2 = calculate_distinct_n(responses)
    positive_satisfaction = [detector.constraint_satisfaction(response, positive_skills) for response in responses]
    negative_satisfaction = [detector.constraint_satisfaction(response, negative_skills) for response in responses]
    qualities = get_response_quality(context, responses)
    return {"Distinctiveness": distinct_2, "positive_constraints": positive_satisfaction, "negative_constraints": negative_satisfaction, **qualities}

evaluate_responses(context, responses, [616, 617], [623, 624])

100%|█████████████| 2/2 [00:04<00:00,  2.19s/it]


{'distinctiveness': 0.8,
 'positive_constraints': [[1.0, 1.0], [1.0, 1.0]],
 'negative_constraints': [[0.0, 0.0], [0.0, 0.0]],
 'Appropriateness': [2.0, 4.0],
 'Relevance': [2.0, 2.0],
 'Content Richness': [2.0, 4.0],
 'Grammatical Correctness': [2.0, 4.0]}

In [40]:
def multiple_constraints(responses_list, skills_list):
    """Constraint satisfaction for several response sets.

    ``responses_list`` and ``skills_list`` are paired element-wise; every
    response of a set is checked against that set's skills with
    ``detector.constraint_satisfaction``.

    Returns:
        One list per response set, containing one result per response.
    """
    return [
        [detector.constraint_satisfaction(response, skills) for response in responses]
        for responses, skills in zip(responses_list, skills_list)
    ]


def evaluate_responses_list(contexts, responses_list, positive_skills_list, negative_skills_list):
    """Evaluate several response sets, each with its own context and skills.

    Input: lists of response sets to evaluate
    Output: dict with list of evaluations

    Args:
        contexts: One dialogue context per response set.
        responses_list: List of response sets (each a list of strings).
        positive_skills_list: Positive skill numbers per response set.
        negative_skills_list: Negative skill numbers per response set.

    Returns:
        dict with "Distinctiveness", "positive_constraints" and
        "negative_constraints" (one entry per response set) plus, for every
        quality metric, a list holding the per-response scores of each set.
        With no response sets, the quality metrics are simply absent.
    """
    distinct_2 = [calculate_distinct_n(responses) for responses in responses_list]
    positive_satisfaction = multiple_constraints(responses_list, positive_skills_list)
    negative_satisfaction = multiple_constraints(responses_list, negative_skills_list)
    # Materialize the pairs so the progress bar total matches what is iterated.
    pairs = list(zip(contexts, responses_list))
    qualities = [
        get_response_quality(context, responses)
        for context, responses in tqdm(pairs, total=len(pairs), desc="Contexts")
    ]
    # Guard against empty input: there is no first element to take keys from.
    metric_names = qualities[0] if qualities else {}
    return {"Distinctiveness": distinct_2,
            "positive_constraints": positive_satisfaction,
            "negative_constraints": negative_satisfaction,
            **{key: [d[key] for d in qualities] for key in metric_names}
    }

In [44]:
# Two example dialogue contexts (each ending with an open "B: " turn) and, for
# each context, a list of candidate responses to evaluate.
contexts = ["""A: Can I take the subway to get there?
B: Yes, but that will probably take about half an hour. You should just take a taix.
A: Won't that be expensive?
B: """, """A: Sir, I am very glad to tell that we have successfully registered the trademark for our new product. It is the time to think of some effective promoting strategies. We are beginning to get more attention from overseas.
B: Well done, Fred. Do you know something useful for our promotion for our I-series?
A: OK, Let me see. I suppose we must strengthen our promotion, because our brand is still new to some consumers. Maybe we should start our advertising program with our local and overseas distributors simultaneously, because they stand on a better position for selecting the best ways to advertise in market places. Besides, the advertisement fund can encourage them to spend more attention on advertising our products.
B: """]
responses = [["Not necessarily, you can always share a ride with a friend or take a cheaper taxi service. Would you like me to look up some options for you?", "Yesterday I took the tram. Would you like me to look up the schedule?"], ["That sounds like a good plan. Would you be willing to lead this initiative and work closely with our distributors to ensure the success of our advertising campaign? And could you please report back to me with the progress and any feedback from the distributors?", "People will buy it anyways, I'm sure!"]]

In [45]:
# One list of positive and one list of negative skill numbers per context.
evaluate_responses_list(
    contexts,
    responses,
    positive_skills_list=[[616, 617, 621], [616, 617]],
    negative_skills_list=[[623, 624, 628], [623, 624]],
)

Contexts:   0%|          | 0/2 [00:00<?, ?it/s]

Responses:   0%|          | 0/2 [00:00<?, ?it/s]

Responses:   0%|          | 0/2 [00:00<?, ?it/s]

{'Distinctiveness': [0.8444444444444444, 1.0],
 'positive_constraints': [[[0.0, 0.0, 0.5], [0.0, 0.0, 0.5]],
  [[0.0, 0.0], [0.0, 0.0]]],
 'negative_constraints': [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
  [[0.0, 0.0], [0.0, 0.0]]],
 'Appropriateness': [[4.0, 2.0], [4.0, 2.0]],
 'Relevance': [[4.0, 2.0], [4.0, 2.0]],
 'Content Richness': [[4.0, 2.0], [4.0, 2.0]],
 'Grammatical Correctness': [[5.0, 4.0], [4.0, 4.0]]}