In [1]:
import fastrepl

In [2]:
from IPython.display import clear_output

In [None]:
# Runtime configuration through environment variables. The API key is
# intentionally blank in the saved notebook; set it before running the cells
# that call the gpt-3.5-turbo graders.
# NOTE(review): NUM_THREADS is presumably read by fastrepl to size its worker
# pool — confirm against the library before tuning it.
%env OPENAI_API_KEY=
%env NUM_THREADS=24

In [4]:
from datasets import load_dataset

# Build a small, reproducible evaluation set: shuffle the Yelp test split with
# a fixed seed, keep the first 50 rows, and expose the review text as "sample".
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)
    .select(range(50))
    .rename_column("text", "sample")
)

# Shift each label by one so "reference" sits on the same 1-5 scale the graders
# are asked to use (assumes the raw labels are 0-based — confirm against the
# dataset card). The original "label" column is dropped afterwards.
dataset = dataset.map(
    lambda row: {"reference": row["label"] + 1, "sample": row["sample"]},
    remove_columns=["label"],
)

In [5]:
# Show the dataset summary to confirm the expected columns and row count.
dataset

Dataset({
    features: ['sample', 'reference'],
    num_rows: 50
})

In [5]:
def to_number(example):
    """Coerce an example's "prediction" field to a float, in place.

    A missing prediction (``None``) is reported by printing "None" and is
    replaced with 0 before conversion, so it becomes ``0.0``.

    Args:
        example: Mapping with a "prediction" entry that is either ``None`` or
            something ``float()`` accepts.

    Returns:
        The same ``example`` object, with "prediction" overwritten by a float.
    """
    raw = example["prediction"]
    if raw is None:
        # Make the missing value visible in the cell output before defaulting.
        print("None")
        raw = 0
    example["prediction"] = float(raw)
    return example


def print_metric(metric_name, predictions, references):
    """Compute one fastrepl metric and print it as ``"<name>: <value>"``.

    Args:
        metric_name: Name passed to ``fastrepl.load_metric``; also the key
            used to read the score out of the result.
        predictions: Predicted values handed to the metric's ``run``.
        references: Ground-truth values handed to the metric's ``run``.
    """
    scores = fastrepl.load_metric(metric_name).run(
        predictions=predictions,
        references=references,
    )
    print(f"{metric_name}: {scores[metric_name]}")

In [6]:
clear_output(wait=True)

# Baseline grader: gpt-3.5-turbo is asked directly for a 1-5 satisfaction score.
eval1 = fastrepl.SimpleEvaluator(
    node=fastrepl.LLMGradingHead(
        context="You will get a input text from Yelp review. Grade user's satisfaction from 1 to 5.",
        model="gpt-3.5-turbo",
        number_from=1,
        number_to=5,
        position_debias_strategy="shuffle",
    )
)

# Run the evaluator over the dataset, then coerce every prediction to a float
# so the metrics below receive numeric values.
result1 = (
    fastrepl.local_runner(
        evaluator=eval1,
        dataset=dataset,
        output_feature="prediction",
    )
    .run()
    .map(to_number)
)

for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result1["prediction"], result1["reference"])

# Preview the first ten graded rows.
result1.to_pandas().head(10)

Output()

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

accuracy: 0.62
mse: 0.44
mae: 0.4


Unnamed: 0,sample,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,5.0
1,We tried out the lunch specials and found them...,4,4.0
2,Should have known better than to eat in a plac...,1,2.0
3,This place has sure changed...and not for the ...,2,3.0
4,I've been bringing my son to the owner Michael...,5,5.0
5,The trip to the location takes two busses and ...,1,1.0
6,This restaurant was suggested to me by a frien...,4,5.0
7,This place is unique because you are sitting o...,3,3.0
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,4.0
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,5.0


In [6]:
clear_output(wait=True)

# Chain-of-thought variant: same prompt, model and 1-5 scale, but graded
# through LLMGradingHeadCOT instead of the plain grading head.
eval2 = fastrepl.SimpleEvaluator(
    node=fastrepl.LLMGradingHeadCOT(
        context="You will get a input text from Yelp review. Grade user's satisfaction from 1 to 5.",
        model="gpt-3.5-turbo",
        number_from=1,
        number_to=5,
        position_debias_strategy="shuffle",
    )
)

# Run the evaluator over the dataset, then coerce every prediction to a float
# so the metrics below receive numeric values.
result2 = (
    fastrepl.local_runner(
        evaluator=eval2,
        dataset=dataset,
        output_feature="prediction",
    )
    .run()
    .map(to_number)
)

for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result2["prediction"], result2["reference"])

# Preview the first ten graded rows.
result2.to_pandas().head(10)

Output()

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

accuracy: 0.74
mse: 0.25
mae: 0.26


Unnamed: 0,sample,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,4.0
1,We tried out the lunch specials and found them...,4,4.0
2,Should have known better than to eat in a plac...,1,1.0
3,This place has sure changed...and not for the ...,2,2.0
4,I've been bringing my son to the owner Michael...,5,5.0
5,The trip to the location takes two busses and ...,1,1.0
6,This restaurant was suggested to me by a frien...,4,4.0
7,This place is unique because you are sitting o...,3,3.0
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,3.0
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,5.0
