## Evaluation of OpenAI's fine-tuned models

This notebook evaluates the fine-tuned models from OpenAI. Unlike the `evaluation_classification.ipynb` notebook, the model response here uses a different JSON format, which contains the research object as a list of strings along with taxonomy explanations.

The evaluation is done by comparing the research objects of the ground truth and the predicted response.

In [None]:
import os
import pandas as pd

# Name of the fine-tuning run; every input and output file name is derived from it.
model_name = 'nostalgic_montalcini'

# Inputs: JSONL files located two directory levels above the working directory.
data_dir = os.path.join('..', '..', 'data')
input_dir = os.path.join(data_dir, 'openai_fine_tune')
training_file, validation_file = (
    os.path.join(input_dir, file_name)
    for file_name in (f'{model_name}_train.jsonl', f'{model_name}_test.jsonl')
)

# Outputs: CSV reports for this run.
evaluation_dir = os.path.join('..', '..', 'reports', 'openai_fine_tune')
os.makedirs(evaluation_dir, exist_ok=True)  # no-op when the directory already exists
predictions_file, evaluation_file = (
    os.path.join(evaluation_dir, file_name)
    for file_name in (f'{model_name}_predictions.csv', f'{model_name}_evaluation.csv')
)

In [None]:
# Sampling temperatures to sweep: 0.0, 0.25, 0.5, 0.75, 1.0
temperatures = [step / 4 for step in range(5)]

# Identifiers of the fine-tuned models to evaluate
models = [
    'ada:ft-personal:nostalgic-montalcini-2023-07-31-09-24-03',
    'curie:ft-personal:nostalgic-montalcini-2023-07-31-12-04-54',
]

In [None]:
# Both files are read as JSON Lines (one JSON record per line).
train, test = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (training_file, validation_file)
)

In [None]:
import openai
import json


def get_completion(ft_model: str, prompt: str, temperature: float = 0.0) -> dict:
    """Query a fine-tuned completion model and parse its reply as a JSON object.

    Args:
        ft_model: Identifier of the fine-tuned model to query.
        prompt: Prompt text. The separator ``"\\n\\n###\\n\\n"`` is appended
            when the prompt does not already end with it.
        temperature: Sampling temperature passed to the API.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned when the
        completion is empty, is not valid JSON, or decodes to something other
        than a JSON object (callers rely on ``dict.get``).
    """
    separator = "\n\n###\n\n"
    # The separator is 7 characters long, so comparing against a fixed 8-character
    # slice never matches and would append it a second time. ``endswith`` checks
    # the suffix regardless of its length.
    if not prompt.endswith(separator):
        prompt += separator

    response = openai.Completion.create(
        model=ft_model,
        prompt=prompt,
        max_tokens=1500,
        temperature=temperature,
        stop=["\n"],  # generation stops at the first newline
    )

    completion = response['choices'][0]['text']
    if completion == "":
        return {}

    try:
        data = json.loads(completion)
    except json.decoder.JSONDecodeError:
        print(f"Error: {completion}")
        return {}

    # Valid JSON that is not an object (list, string, number, ...) is unusable here.
    if not isinstance(data, dict):
        print(f"Error: {completion}")
        return {}

    return data

In [None]:
import tqdm

# Column-oriented accumulator: one row per (test prompt, temperature) pair,
# plus one prediction column per fine-tuned model.
evaluation_input = {
    "prompt": [],
    "ground_truth": [],
    "temperature": [],
}

for model in models:
    evaluation_input[model] = []

for _, row in tqdm.tqdm(test.iterrows(), total=len(test)):
    test_prompt = row['prompt']
    # The reference completion is a JSON object; only its "Research Object"
    # entry is compared against the predictions.
    ground_truth_labels = json.loads(row['completion'])["Research Object"]

    for temperature in temperatures:
        evaluation_input['prompt'].append(test_prompt)
        evaluation_input['ground_truth'].append(ground_truth_labels)
        evaluation_input['temperature'].append(temperature)

        for model in models:
            model_completion = get_completion(model, test_prompt, temperature)

            # Model output is untrusted: fall back to "no labels" when the reply
            # is not a JSON object.
            if isinstance(model_completion, dict):
                predicted_labels = model_completion.get("Research Object", [])
            else:
                predicted_labels = []
            # A bare string would otherwise be split into single characters by set().
            if isinstance(predicted_labels, str):
                predicted_labels = [predicted_labels]

            evaluation_input[model].append(set(predicted_labels))

In [None]:
# Turn the column-wise accumulator into a table and persist the raw predictions.
evaluation_df = pd.DataFrame(data=evaluation_input)
evaluation_df.to_csv(path_or_buf=predictions_file, index=False)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# One metrics record per (model, temperature) combination
evaluation_results = []

for model in models:
    for temperature in temperatures:
        # Restrict the prediction table to rows generated at this temperature
        at_temperature = evaluation_df['temperature'] == temperature
        focus_df = evaluation_df[at_temperature]
        reference_labels = focus_df["ground_truth"]
        predicted_labels = focus_df[model]

        # Label vocabulary: every label seen in the references or the predictions
        all_labels = set().union(*reference_labels, *predicted_labels)

        # Encode both sides as binary indicator matrices over the same classes
        mlb = MultiLabelBinarizer(classes=list(all_labels))
        ground_truth_binary = mlb.fit_transform(reference_labels)
        predicted_binary = mlb.transform(predicted_labels)

        # Accuracy on indicator matrices is exact-match (subset) accuracy;
        # precision, recall and F1 are micro-averaged over all labels.
        evaluation_results.append({
            "model": model,
            "temperature": temperature,
            "accuracy": accuracy_score(ground_truth_binary, predicted_binary),
            "precision": precision_score(ground_truth_binary, predicted_binary, average='micro'),
            "recall": recall_score(ground_truth_binary, predicted_binary, average='micro'),
            "f1": f1_score(ground_truth_binary, predicted_binary, average='micro'),
        })

In [None]:
# Collect the per-(model, temperature) metric records into a table and save it
evaluation_results_df = pd.DataFrame.from_records(data=evaluation_results)
evaluation_results_df.to_csv(path_or_buf=evaluation_file, index=False)
# Bare last expression so the notebook renders the table
evaluation_results_df