In [1]:
import fastrepl

In [2]:
from IPython.display import clear_output

In [None]:
# Provider API keys are exposed to the kernel as environment variables.
# The values are blank in this copy of the notebook; fill them in locally
# before running and do not commit real keys.
%env OPENAI_API_KEY=
%env DEEPINFRA_API_KEY=

In [4]:
from fastrepl.utils import map_number_range
from datasets import load_dataset

# Build the evaluation set in one chain:
#   1. load the test split of "yelp_review_full",
#   2. shuffle with a fixed seed and keep the first 100 rows,
#   3. rename "text" -> "sample",
#   4. derive "reference" from "label" via map_number_range(label, 0, 4, 0, 10)
#      and drop the original "label" column.
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=12)
    .select(range(100))
    .rename_column("text", "sample")
    .map(
        lambda row: {
            "reference": map_number_range(row["label"], 0, 4, 0, 10),
            "sample": row["sample"],
        },
        remove_columns=["label"],
    )
)

In [5]:
# Show the prepared dataset's summary (feature names and row count).
dataset

Dataset({
    features: ['sample', 'reference'],
    num_rows: 100
})

In [6]:
def print_metric(metric_name, predictions, references):
    """Compute a single fastrepl metric and print it as ``<name>: <value>``.

    Args:
        metric_name: Name handed to ``fastrepl.load_metric``; the same string
            is used as the key to read the score out of the run result.
        predictions: Predicted values forwarded to the metric's ``run``.
        references: Reference values forwarded to the metric's ``run``.

    Returns:
        None. The score is only printed.
    """
    # Load the metric and evaluate it in one step; the result is indexed by
    # the metric's own name.
    result = fastrepl.load_metric(metric_name).run(
        predictions=predictions,
        references=references,
    )
    print(f"{metric_name}: {result[metric_name]}")

In [7]:
# wait=True defers clearing the cell output until new output is available.
clear_output(wait=True)

# Grading head backed by gpt-3.5-turbo, configured with a 0..10 number range
# and two reference (text, grade) pairs at the ends of that range.
grading_head1 = fastrepl.LLMGradingHead(
    model="gpt-3.5-turbo-0613",
    context="You will get a input text from Yelp review.",
    number_from=0,
    number_to=10,
    position_debias_strategy="shuffle",
    references=[
        ("this is the best", "10"),
        ("this is the worst", "0"),
    ],
)
eval1 = fastrepl.SimpleEvaluator(node=grading_head1)

# Run the evaluator over the dataset; grades are written to the "prediction" feature.
runner1 = fastrepl.local_runner(
    evaluator=eval1,
    dataset=dataset,
    output_feature="prediction",
)
result1 = runner1.run()

# Report each metric for predictions vs. references, in the same order as before.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result1["prediction"], result1["reference"])

# Preview the first ten rows.
result1.to_pandas().head(10)

Output()

accuracy: 0.63
mse: 2.5
mae: 0.95


Unnamed: 0,sample,reference,prediction
0,I recently had my make-up done by Jena for an ...,10.0,10.0
1,"Last two visits, very poor, rude service. Ver...",2.5,0.0
2,Five stars because it's a place were comfort f...,10.0,10.0
3,"Montreal is a great foodie town, and it has it...",5.0,7.5
4,Took my friend to try his first pho here. It w...,7.5,7.5
5,Waited 1 hour for food! Someone ordered Bened...,0.0,0.0
6,Pita jungle is pita jungle is pita jungle. The...,7.5,7.5
7,The only reason I am giving this place a 3 sta...,5.0,5.0
8,"Wendy's Noodle Cafe, located at the Western ed...",5.0,7.5
9,"I love Moe's, personally I like it better than...",5.0,7.5


In [9]:
# wait=True defers clearing the cell output until new output is available.
clear_output(wait=True)

# Grading head backed by Mistral-7B-Instruct served through DeepInfra, with the
# same 0..10 number range and reference (text, grade) pairs as the first run.
grading_head2 = fastrepl.LLMGradingHead(
    model="deepinfra/mistralai/Mistral-7B-Instruct-v0.1",
    context="You will get a input text from Yelp review.",
    number_from=0,
    number_to=10,
    position_debias_strategy="shuffle",
    references=[
        ("this is the best", "10"),
        ("this is the worst", "0"),
    ],
)
eval2 = fastrepl.SimpleEvaluator(node=grading_head2)

# Run the evaluator over the dataset; grades are written to the "prediction" feature.
runner2 = fastrepl.local_runner(
    evaluator=eval2,
    dataset=dataset,
    output_feature="prediction",
)
result2 = runner2.run()

# Report each metric for predictions vs. references, in the same order as before.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result2["prediction"], result2["reference"])

# Preview the first ten rows.
result2.to_pandas().head(10)

Output()

accuracy: 0.45
mse: 4.0
mae: 1.45


Unnamed: 0,sample,reference,prediction
0,I recently had my make-up done by Jena for an ...,10.0,7.5
1,"Last two visits, very poor, rude service. Ver...",2.5,2.5
2,Five stars because it's a place were comfort f...,10.0,10.0
3,"Montreal is a great foodie town, and it has it...",5.0,7.5
4,Took my friend to try his first pho here. It w...,7.5,7.5
5,Waited 1 hour for food! Someone ordered Bened...,0.0,0.0
6,Pita jungle is pita jungle is pita jungle. The...,7.5,5.0
7,The only reason I am giving this place a 3 sta...,5.0,5.0
8,"Wendy's Noodle Cafe, located at the Western ed...",5.0,5.0
9,"I love Moe's, personally I like it better than...",5.0,7.5
