In [1]:
import fastrepl

In [2]:
import re
from datasets import load_dataset

ds = load_dataset("daily_dialog", split="test")
ds = ds.shuffle(4)
ds = ds.select(range(50))


def clean(text):
    return re.sub(r"\s+([,.'!?])", r"\1", text.strip())


def get_input(row):
    msgs = [clean(msg) for msg in row["dialog"]]
    row["sample"] = "\n".join(msgs)  # `SimpleEvaluator` expect `sample` column

    return row


ds = ds.map(get_input, remove_columns=["dialog", "act", "emotion"])
ds

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['sample'],
    num_rows: 50
})

In [3]:
evaluator = fastrepl.SimpleEvaluator(
    node=fastrepl.LLMClassificationHead(
        model="gpt-3.5-turbo",
        context="You will receive casual conversation between two people.",
        labels={
            "FUN": "at least one of the two people try to be funny and entertain.",
            "NOT_FUN": "given conversation lacks humor or entertainment value.",
        },
    ),
)

In [5]:
result = fastrepl.local_runner(evaluator=evaluator, dataset=ds).run(num=4)
result.to_pandas()[:10]

Output()

Unnamed: 0,sample,result
0,"Would you like to take a look at the menu, sir...","[NOT_FUN, NOT_FUN, NOT_FUN, NOT_FUN]"
1,Help! Help!\nWhat's the matter?,"[NOT_FUN, FUN, FUN, NOT_FUN]"
2,"Whatever we do, we should do it above board.\n...","[FUN, FUN, FUN, FUN]"
3,"May I see your passport, please?\nCertainly. H...","[NOT_FUN, NOT_FUN, FUN, FUN]"
4,We're thinking about going to America.\nHave y...,"[FUN, NOT_FUN, FUN, NOT_FUN]"
5,"Do you believe in UFOs?\nOf course, they are o...","[FUN, NOT_FUN, FUN, FUN]"
6,What do you think about the equipment in our c...,"[FUN, NOT_FUN, NOT_FUN, FUN]"
7,How was your business trip?\nGreat - they wine...,"[FUN, FUN, FUN, NOT_FUN]"
8,"Hello, Parker. How ’ s everything?\nCan ’ t co...","[NOT_FUN, FUN, NOT_FUN, NOT_FUN]"
9,Our toner cartridges are already out of ink......,"[FUN, NOT_FUN, FUN, FUN]"


In [6]:
fastrepl.Analyzer(result).run(mode="kappa")

{'kappa': -0.009383345174804679}

In [7]:
def metric(result):
    f = result.count("FUN")
    nf = result.count("NOT_FUN")
    return f / (f + nf)


for r in list(zip(*result["result"])):
    print(metric(r))

0.52
0.52
0.52
0.4
