In [None]:
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split

In [None]:
# OpenAI API key. Read it from the OPENAI_API_KEY environment variable when it
# is set, so the secret does not have to be pasted into (and saved with) the
# notebook. The placeholder is kept as a fallback for manual entry.
API_KEY = os.environ.get("OPENAI_API_KEY", "[INSERT HERE]")

In [None]:
#%env OPENAI_API_KEY=INSERT

In [None]:
# Dataset name -> name of the CSV column that holds the text to evaluate.
# The key order is also the order in which the baseline loop processes datasets.
fields = {
    "reuters": "text",
    "spookyauthor": "text",
    "trustpilot": "text",
    "yelp": "review",
    "reddit-mental-health": "text",
}

In [None]:
# Dataset name -> column name, defined only for the two review datasets.
# NOTE(review): presumably the utility-task label column; it is not referenced
# by the cells visible here - confirm against its users.
util = {
    "trustpilot": "sentiment",
    "yelp": "sentiment",
}

In [None]:
# Dataset name -> column name of a per-row label (author, gender, user id, ...).
# NOTE(review): presumably the attribute being protected in each dataset; it is
# not referenced by the cells visible here - confirm against its users.
labels = {
    "reuters": "author",
    "spookyauthor": "author",
    "trustpilot": "gender",
    "yelp": "user_id",
    "reddit-mental-health": "author_id",
}

# G-Eval with DeepEval

In [None]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval.dataset import EvaluationDataset
from deepeval.evaluate import DisplayConfig, evaluate, ErrorConfig

In [None]:
# Result store: maps a dataset name or result-file stem to its dict of averaged
# metric scores. The evaluation loops also use it to skip entries that were
# already scored in the current kernel session.
log = dict()

In [None]:
# Fluency: LLM-judged grammar/syntax quality.
# Only the candidate text (ACTUAL_OUTPUT) is listed in evaluation_params, so the
# test case's `input` does not take part in the judgement.
fluency = GEval(
    name="Fluency",
    model="gpt-4o-mini",
    strict_mode=False,
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    criteria="Measure how smoothly the text reads, focusing on grammar and syntax.",
)

In [None]:
# Consistency: LLM-judged uniformity of style and tone within a single text.
# Only the candidate text (ACTUAL_OUTPUT) is listed in evaluation_params.
consistency = GEval(
    name="Consistency",
    model="gpt-4o-mini",
    strict_mode=False,
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    criteria="Ensures the text maintains a uniform style and tone throughout.",
)

In [None]:
# Clarity: LLM-judged ease of understanding for a reader.
# Only the candidate text (ACTUAL_OUTPUT) is listed in evaluation_params.
clarity = GEval(
    name="Clarity",
    model="gpt-4o-mini",
    strict_mode=False,
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    criteria="Evaluates how easily the actual output can be understood by the reader.",
)

In [None]:
# Conciseness: LLM-judged absence of unnecessary words or details.
# Only the candidate text (ACTUAL_OUTPUT) is listed in evaluation_params.
conciseness = GEval(
    name="Conciseness",
    model="gpt-4o-mini",
    strict_mode=False,
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    criteria="Assesses whether the text is free of unnecessary words or details.",
)

In [None]:
# Repetitiveness: LLM-judged redundancy / repeated information.
# Only the candidate text (ACTUAL_OUTPUT) is listed in evaluation_params.
# NOTE(review): the criteria text does not say whether a high score means "very
# repetitive" or "free of repetition" - confirm how the judge interprets it
# before comparing this score with the other metrics.
repetitiveness = GEval(
    name="Repetitiveness",
    model="gpt-4o-mini",
    strict_mode=False,
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    criteria="Checks for redundancy or repeated information in the text.",
)

In [None]:
# Display settings passed as display_config to every evaluate() call:
# no per-test result printout, progress indicator on, verbose mode off.
display = DisplayConfig(
    verbose_mode=False,
    show_indicator=True,
    print_results=False,
)

In [None]:
# Error settings passed as error_config to every evaluate() call.
# NOTE(review): presumably lets a run continue past failing metric calls; the
# aggregation code filters out null scores - confirm against the DeepEval docs.
error = ErrorConfig(
    ignore_errors=True,
)

In [None]:
# Keys under which scores are collected and averaged. They match the first word
# of each metric's reported name and follow the order of the metric list that
# is passed to evaluate().
metric_names = [
    "Fluency",
    "Consistency",
    "Clarity",
    "Conciseness",
    "Repetitiveness",
]

In [None]:
# Baseline: score the unmodified texts of every configured dataset with the
# five metrics. Each text is used as both `input` and `actual_output`.
# Results are averaged per metric, stored in `log`, and checkpointed to
# data/geval_baseline.json after every dataset.
# NOTE(review): `log` is shared with the cell that scores the files in data/,
# so entries stored here are also present when that cell writes its own JSON.
for f in fields:
    print(f)
    if f in log:
        # Already scored in this kernel session; do not repeat the API calls.
        continue

    data = pd.read_csv("data/datasets/{}.csv".format(f))
    # Cap the sample at the frame size: sample(n=100) raises ValueError when
    # the dataset has fewer than 100 rows.
    data = data.sample(n=min(100, len(data)), random_state=42)

    # Empty CSV cells load as NaN (a float), not text, so they are dropped
    # from the sample instead of being sent to the judge.
    cases = [
        LLMTestCase(input=x, actual_output=x)
        for x in data[fields[f]].dropna().to_list()
    ]
    if not cases:
        print("no usable texts in {}, skipping".format(f))
        continue

    dataset = EvaluationDataset(test_cases=cases)
    res = evaluate(
        test_cases=dataset,
        metrics=[fluency, consistency, clarity, conciseness, repetitiveness],
        display_config=display,
        error_config=error,
    )

    # Collect the raw scores per metric. Only the first word of the reported
    # metric name is used as the key (e.g. "Fluency").
    results = {m: [] for m in metric_names}
    for r in res.test_results:
        # NOTE(review): metrics_data is treated as possibly None for test cases
        # that produced no metric output - confirm against DeepEval.
        for m in r.metrics_data or []:
            results.setdefault(m.name.split()[0], []).append(m.score)

    # Average the non-null scores. A metric without any usable score is
    # reported as 0, exactly as before.
    averages = {}
    for r in results:
        nums = [x for x in results[r] if pd.notna(x)]
        if nums:
            # float() keeps plain Python numbers in `log` and in the printout.
            averages[r] = round(float(np.mean(nums)), 3)
        else:
            averages[r] = 0

    log[f] = averages
    print(averages)

    # Rewrite the file after every dataset so an interrupted run keeps the
    # results gathered so far.
    with open("data/geval_baseline.json", 'w', encoding="utf-8") as out:
        json.dump(log, out, indent=3)

In [None]:
# Score every CSV directly under data/ with the five metrics. The part of the
# file stem before the first "_" is taken as the dataset name; each sampled row
# is paired with the row at the same position in data/datasets/<name>.csv.
# Results are averaged per metric, stored in `log` under the file stem, and
# checkpointed to data/geval.json after every file.
# NOTE(review): `log` is the same dict the baseline cell fills, so baseline
# entries already in it are written to data/geval.json too - confirm intended.
for file in sorted(Path("data/").glob("*.csv")):  # sorted: glob order is arbitrary
    print(file.stem)
    if file.stem in log:
        # Already scored in this kernel session; do not repeat the API calls.
        continue

    name = file.stem.split("_")[0]
    if name not in fields:
        # A CSV that does not belong to a configured dataset would otherwise
        # abort the whole run on the lookups below.
        print("skipping {}: unknown dataset '{}'".format(file.name, name))
        continue
    field = fields[name]

    orig = pd.read_csv("data/datasets/{}.csv".format(name))
    # dropna() removes rows with a missing value in any column.
    priv = pd.read_csv(file).dropna()
    # Cap the sample at the frame size: sample(n=100) raises ValueError when
    # fewer than 100 rows remain.
    priv = priv.sample(n=min(100, len(priv)), random_state=42)
    # priv keeps its row labels from the CSV, which are reused here as row
    # positions in the original file.
    # NOTE(review): this relies on both files having the same row order.
    orig = orig.iloc[priv.index]

    cases = [
        LLMTestCase(input=x, actual_output=y)
        for x, y in zip(orig[field].to_list(), priv[field].to_list())
    ]
    if not cases:
        print("no usable rows in {}, skipping".format(file.name))
        continue

    dataset = EvaluationDataset(test_cases=cases)
    res = evaluate(
        test_cases=dataset,
        metrics=[fluency, consistency, clarity, conciseness, repetitiveness],
        display_config=display,
        error_config=error,
    )

    # Collect the raw scores per metric. Only the first word of the reported
    # metric name is used as the key (e.g. "Fluency").
    results = {m: [] for m in metric_names}
    for r in res.test_results:
        # NOTE(review): metrics_data is treated as possibly None for test cases
        # that produced no metric output - confirm against DeepEval.
        for m in r.metrics_data or []:
            results.setdefault(m.name.split()[0], []).append(m.score)

    # Average the non-null scores. A metric without any usable score is
    # reported as 0, exactly as before.
    averages = {}
    for r in results:
        nums = [x for x in results[r] if pd.notna(x)]
        if nums:
            # float() keeps plain Python numbers in `log` and in the printout.
            averages[r] = round(float(np.mean(nums)), 3)
        else:
            averages[r] = 0

    log[file.stem] = averages
    print(averages)

    # Rewrite the file after every input so an interrupted run keeps the
    # results gathered so far.
    with open("data/geval.json", 'w', encoding="utf-8") as out:
        json.dump(log, out, indent=3)