In [1]:
import json
from typing import List
from datasets import Dataset, load_dataset
from openai import OpenAI
from tqdm.auto import tqdm
import concurrent.futures

In [2]:
def evaluate_answer(
    instruction: str,
    answer: str,
    client: OpenAI,
    *,
    model: str = "gpt-4o-mini",
    temperature: float = 0.8,
    max_tokens: int = 1000,
) -> dict:
    """Ask an OpenAI chat model to grade one answer for accuracy and style.

    The judge is given the instruction/answer pair plus a 1-3 rubric for each
    criterion and is asked, in JSON mode, to reply with an object holding an
    ``accuracy`` and a ``style`` entry, each with an ``analysis`` and a
    ``score``.

    Args:
        instruction: The instruction the answer responds to.
        answer: The answer to grade.
        client: OpenAI client used to send the chat-completion request.
        model: Name of the judge model (default ``"gpt-4o-mini"``).
        temperature: Sampling temperature for the judge (default ``0.8``).
        max_tokens: Cap on completion tokens (default ``1000``).

    Returns:
        The judge's reply parsed from JSON. The layout is requested in the
        prompt but is not validated here.

    Raises:
        json.JSONDecodeError: If the reply text is not valid JSON.
    """
    prompt = f"""
    You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria:
        
            1. Accuracy: How factually correct is the information presented in 
            the answer? You are a technical expert in this topic.

            2. Style: Is the tone and writing style appropriate for a blog post 
            or social media content? It should use simple but technical words 
            and avoid formal or academic language.

        Accuracy scale:
            1 (Poor): Contains factual errors or misleading information
            2 (Good): Mostly accurate with minor errors or omissions
            3 (Excellent): Highly accurate and comprehensive

        Style scale:
            1 (Poor): Too formal, uses some overly complex words
            2 (Good): Good balance of technical content and accessibility, but 
            still uses formal words and expressions
            3 (Excellent): Perfectly accessible language for blog/social media, 
            uses simple but precise technical terms when necessary
        
    Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture.
    Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks.
    Instruction: {instruction}
    Answer: {answer}
    Provide your evaluation in JSON format with the following structure:
    {{
        "accuracy": {{
        "analysis": "...",
        "score": 0
        }},
        "style": {{
        "analysis": "...",
        "score": 0
        }}
    }}
    """

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.",
        },
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        # JSON mode: the reply is requested as a single JSON object
        response_format={"type": "json_object"},
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # NOTE(review): if the API returns no text content, json.loads raises
    # TypeError rather than JSONDecodeError - confirm whether callers care.
    return json.loads(completion.choices[0].message.content)

def evaluate_batch(batch, start_index):
    """Grade a list of (instruction, answer) pairs with one OpenAI client.

    Args:
        batch: Iterable of ``(instruction, answer)`` pairs.
        start_index: Index assigned to the first pair; later pairs count up
            from it.

    Returns:
        A list of ``(index, evaluation)`` tuples in the order of ``batch``,
        where ``evaluation`` is whatever ``evaluate_answer`` returned.
    """
    judge_client = OpenAI()  # one client per call, shared by every pair in the batch
    indexed_results = []
    for offset, (instruction, answer) in enumerate(batch):
        verdict = evaluate_answer(instruction, answer, judge_client)
        indexed_results.append((start_index + offset, verdict))
    return indexed_results


In [3]:
def _replace_column(dataset, name: str, values: list):
    """Return ``dataset`` with column ``name`` set to ``values``, dropping any existing column of that name first."""
    if name in dataset.column_names:
        dataset = dataset.remove_columns([name])
    return dataset.add_column(name, values)


def _mean_of_valid_scores(scores: list) -> float:
    """Average the entries of ``scores`` that are not ``None``; NaN when there are none."""
    valid = [score for score in scores if score is not None]
    if not valid:
        # Nothing usable to average: report NaN instead of dividing by zero.
        return float("nan")
    return sum(valid) / len(valid)


def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset:
    """Grade every stored answer of a model and push the scored dataset to the Hub.

    Loads ``SkillRipper/<name>-results`` (``split="all"``), where ``<name>`` is
    the part of ``model_id`` after the last ``/``. The ``instruction`` /
    ``answers`` pairs are cut into batches and graded by ``evaluate_batch`` on
    a thread pool. The raw evaluations and the extracted ``accuracy`` and
    ``style`` scores are stored as columns (replacing columns of the same name
    if present), the averages are printed, and the dataset is pushed back to
    the same repo.

    Args:
        model_id: Model identifier; only the segment after the last ``/`` is used.
        num_threads: Maximum number of worker threads.
        batch_size: Number of (instruction, answer) pairs per submitted batch.

    Returns:
        The dataset with ``evaluation``, ``accuracy`` and ``style`` columns.

    Raises:
        Any exception raised inside a worker is re-raised here when its
        result is collected, before anything is pushed.
    """
    model_name = model_id.split('/')[-1]
    repo_id = f"SkillRipper/{model_name}-results"
    dataset = load_dataset(repo_id, split="all")

    # Look the two columns up once rather than once per batch.
    instructions = dataset["instruction"]
    answers = dataset["answers"]
    batches = [
        (start, list(zip(instructions[start:start + batch_size], answers[start:start + batch_size])))
        for start in range(0, len(dataset), batch_size)
    ]

    # Batches finish in arbitrary order; the index returned with each
    # evaluation puts it back in its row position.
    evaluations = [None] * len(dataset)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for index, evaluation in future.result():
                evaluations[index] = evaluation

    dataset = _replace_column(dataset, "evaluation", evaluations)

    accuracy_scores = []
    style_scores = []
    for evaluation in dataset['evaluation']:
        try:
            eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
            accuracy_score = eval_dict['accuracy']['score']
            style_score = eval_dict['style']['score']
        except (json.JSONDecodeError, KeyError, TypeError):
            # A malformed evaluation counts as missing for both criteria.
            accuracy_score = None
            style_score = None
        accuracy_scores.append(accuracy_score)
        style_scores.append(style_score)

    dataset = _replace_column(dataset, 'accuracy', accuracy_scores)
    dataset = _replace_column(dataset, 'style', style_scores)

    # Compute and print model-wide averages over the rows that have a score
    avg_accuracy = _mean_of_valid_scores(accuracy_scores)
    avg_style = _mean_of_valid_scores(style_scores)

    print(f"{model_name} - Accuracy: {avg_accuracy:.2f}")
    print(f"{model_name} - Style: {avg_style:.2f}")

    dataset.push_to_hub(repo_id)
    return dataset

In [4]:
# Models to score; evaluate_answers uses the part after the last "/" to find each results dataset.
model_ids = [
    'unsloth/Llama-3.2-3B-Instruct',
    'SkillRipper/TwinLlama-3.2-3B-DPO',
    'SkillRipper/TwinLlama-3.2-3B-Instruct',
]

In [5]:
# Grade the listed models one after another; each call prints that model's
# average scores and pushes its scored dataset.
for model_id in model_ids:
    evaluate_answers(model_id=model_id)

README.md:   0%|          | 0.00/380 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/300k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/211 [00:00<?, ? examples/s]

  0%|          | 0/43 [00:00<?, ?it/s]

Llama-3.2-3B-Instruct - Accuracy: 2.70
Llama-3.2-3B-Instruct - Style: 1.98


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/380 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/208k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/211 [00:00<?, ? examples/s]

  0%|          | 0/43 [00:00<?, ?it/s]

TwinLlama-3.2-3B-DPO - Accuracy: 2.38
TwinLlama-3.2-3B-DPO - Style: 2.02


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/380 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/209k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/211 [00:00<?, ? examples/s]

  0%|          | 0/43 [00:00<?, ?it/s]

TwinLlama-3.2-3B-Instruct - Accuracy: 2.41
TwinLlama-3.2-3B-Instruct - Style: 2.01


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]