# Exp014: Evaluation metrics
The goal of this experiment is to evaluate the three tasks on several quality metrics.

In [71]:
import nltk
nltk.download("punkt")
from nltk.util import ngrams
import os

import sys
sys.path.append('../source')
import models
import helpers

[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Let's start with distinctiveness. This metric should be calculated over responses generated for the same constraints but for different contexts, to make sure that the model does not simply repeat the same phrases regardless of the context.

In [69]:
def calculate_distinct_n(texts, n=2):
    """Compute the distinct-n score of one or more texts.

    Every text is tokenized with ``nltk.word_tokenize`` and split into
    n-grams; the n-grams of all texts are pooled, and the score is the
    number of unique n-grams divided by the total number of n-grams.

    Args:
        texts: A single string or an iterable of strings.
        n: Size of the n-grams (default 2, i.e. bigrams).

    Returns:
        The ratio unique / total n-grams, or 0 if no n-gram could be formed.
    """
    # Accept a bare string as a one-element collection.
    if isinstance(texts, str):
        texts = [texts]

    grams_by_text = []
    for text in texts:
        tokens = nltk.word_tokenize(text)
        grams_by_text.append(list(ngrams(tokens, n)))

    # Pool the n-grams of all texts so repetition *across* texts is penalized.
    pooled = helpers.flatten_list_of_lists(grams_by_text)
    distinct_count = len(set(pooled))
    total_count = len(pooled)

    if total_count > 0:
        return distinct_count / total_count
    return 0

Add an easy way to summarize skill detection

In [65]:
class GrammarDetection():
    """Collection of per-skill grammar classifiers with scoring helpers.

    Classifiers are loaded with ``models.load_classifier`` and evaluated with
    ``models.probe_model``; they are stored in ``self.classifiers`` keyed by
    skill number.
    """

    def __init__(self, dir="corpus_training", skill_nrs=None):
        """Load one classifier per skill number.

        Args:
            dir: Name of the sub-directory of ``../models`` that holds the
                checkpoints, named ``<skill_nr>.pth``. (The name shadows the
                builtin ``dir`` but is kept for backward compatibility.)
            skill_nrs: Skill numbers to load. If ``None``, every ``*.pth``
                file found in ``../models/<dir>`` is loaded.

        Raises:
            ValueError: If a discovered ``.pth`` file name is not an integer.
        """
        if skill_nrs is None:
            skill_nrs = []
            for name in os.listdir(f"../models/{dir}"):
                stem, ext = os.path.splitext(name)
                # Skip anything that is not a checkpoint (e.g. hidden
                # folders such as .ipynb_checkpoints) instead of crashing.
                if ext == ".pth":
                    skill_nrs.append(int(stem))
        # Load from the same directory that was listed above; previously the
        # directory was hard-coded here, so a non-default `dir` was ignored.
        # NOTE(review): assumes the second argument of load_classifier is the
        # model directory name -- confirm against models.load_classifier.
        self.classifiers = {nr: models.load_classifier(nr, dir) for nr in skill_nrs}

    def score_texts(self, sentences):
        """Probe every loaded classifier with ``sentences``.

        Returns:
            Dict mapping skill number to the raw ``models.probe_model`` output.
        """
        return {nr: models.probe_model(classifier, sentences) for nr, classifier in self.classifiers.items()}

    def constraint_satisfaction(self, sentences, constraints):
        """Average, over ``constraints``, the share of sentences detected.

        For each skill number in ``constraints`` the first element of the
        ``models.probe_model`` output is thresholded at 0.5 (presumably one
        score per sentence -- TODO confirm) and the fraction of sentences
        above the threshold is recorded; the mean of these fractions is
        returned.

        Raises:
            KeyError: If a constraint has no loaded classifier.
            ZeroDivisionError: If ``constraints`` is empty.
        """
        hits = []
        for nr in constraints:
            outputs = models.probe_model(self.classifiers[nr], sentences)
            hits.append(sum(outputs[0] > 0.5) / len(sentences))
        return sum(hits) / len(hits)

In [66]:
# Build the detector with its default arguments, i.e. one classifier for
# every checkpoint found in ../models/corpus_training.
detector = GrammarDetection()

Example evaluation

In [74]:
# Two sample sentences that share the same opening phrase.
sentences = [
    "I would like to invite you to dinner tonight.",
    "I would like to invite our entire family to the wedding.",
]

# Average detection rate for two groups of skill numbers.
positive_constraints = detector.constraint_satisfaction(sentences, [616, 617])
negative_constraints = detector.constraint_satisfaction(sentences, [623, 624])

# Share of unique bigrams across both sentences.
distinctiveness = calculate_distinct_n(sentences)

print(f"Positive: {positive_constraints}, Negative: {negative_constraints}, Distinct-2: {distinctiveness}")

Positive: 0.75, Negative: 0.0, Distinct-2: 0.8
