## Evaluation of OpenAI's fine-tuned models

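This notebook evaluates four fine-tuned OpenAI completion models (ada, babbage, curie and davinci) on the held-out test split. Each test prompt is sent to every model at several sampling temperatures, the predicted label lists are saved to a CSV file, and accuracy, micro-averaged precision, recall and F1 are reported per model and temperature.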

In [29]:
import os
import pandas as pd

# Fine-tuning datasets are read from ../../data/openai_fine_tune,
# resolved relative to the current working directory.
data_dir = os.path.join('..', '..', 'data')
input_dir = os.path.join(data_dir, 'openai_fine_tune')

# Base name shared by the train/test JSONL files of this fine-tuning run.
model_name = 'awesome_elion'

# Train and test splits sit side by side inside input_dir.
training_file, validation_file = (
    os.path.join(input_dir, file_name)
    for file_name in (f'{model_name}_train.jsonl', f'{model_name}_test.jsonl')
)

# Report folder for this evaluation; created if missing (no error when it already exists).
evaluation_dir = os.path.join('..', '..', 'reports', 'openai_fine_tune')
os.makedirs(evaluation_dir, exist_ok=True)

# CSV targets: raw per-prompt predictions and the aggregated metric table.
predictions_file, evaluation_file = (
    os.path.join(evaluation_dir, csv_name)
    for csv_name in (f'{model_name}_predictions.csv', f'{model_name}_evaluation.csv')
)

In [None]:
# Identifiers of the fine-tuned models under evaluation
# (one fine-tune each of ada, babbage, curie and davinci).
models = [
    'ada:ft-personal:awesome-elion-2023-07-25-09-57-12',
    'babbage:ft-personal:awesome-elion-2023-07-25-18-34-52',
    'curie:ft-personal:awesome-elion-2023-07-25-10-01-39',
    'davinci:ft-personal:awesome-elion-2023-07-25-18-52-58',
]

# Sampling temperatures swept for every model: 0.0 to 1.0 in steps of 0.25.
temperatures = [quarter / 4 for quarter in range(5)]

In [None]:
# Load both splits; each file is JSON Lines (one JSON record per line).
train, test = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (training_file, validation_file)
)

In [None]:
import openai
import json


def get_completion(ft_model: str, prompt: str, temperature: float = 0.0) -> list:
    """Query a fine-tuned completion model and return its answer as a list.

    The prompt is terminated with the separator ``"\\n\\n###\\n\\n"`` (added only
    when it is not already present), sent to ``openai.Completion.create`` and
    the returned text is parsed as JSON.

    Args:
        ft_model: Identifier of the fine-tuned model to query.
        prompt: Prompt text, with or without the trailing separator.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The parsed JSON list. An empty list is returned when the completion is
        empty/whitespace-only, is not valid JSON, or is valid JSON but not a
        list; in the latter two cases the raw completion is printed.
    """
    separator = "\n\n###\n\n"

    # Use endswith rather than a fixed-width slice: the separator is 7
    # characters long, so comparing against the last 8 characters never matches
    # and the separator would be appended a second time.
    if not prompt.endswith(separator):
        prompt += separator

    response = openai.Completion.create(
        model=ft_model,
        prompt=prompt,
        max_tokens=1500,
        temperature=temperature,
        stop=["\n"],  # the answer is expected on a single line
    )

    completion = response['choices'][0]['text']

    # Nothing generated (or only whitespace): treat as "no labels" without
    # reporting a parse error.
    if not completion.strip():
        return []

    try:
        data = json.loads(completion)
    except json.decoder.JSONDecodeError:
        print(f"Error: {completion}")
        return []

    # Callers treat the result as a collection of labels; a bare JSON string or
    # number would be split into characters or fail when turned into a set.
    if not isinstance(data, list):
        print(f"Error: {completion}")
        return []

    return data

In [None]:
import tqdm

# Column-oriented accumulator for the predictions table: one row per
# (test prompt, temperature) pair, plus one column per model that holds the
# model's parsed completion for that row.
evaluation_input = {
    "prompt": [],
    "ground_truth": [],
    "temperature": [],
}

for model in models:
    evaluation_input[model] = []

# Note: this issues len(test) * len(temperatures) * len(models) API requests.
for _, row in tqdm.tqdm(test.iterrows(), total=len(test)):
    test_prompt = row['prompt']
    # The reference completion is stored as JSON text in the test file.
    ground_truth = json.loads(row['completion'])

    for temperature in temperatures:
        evaluation_input['prompt'].append(test_prompt)
        evaluation_input['ground_truth'].append(ground_truth)
        evaluation_input['temperature'].append(temperature)

        # Query every model with the same prompt and temperature so the
        # per-model columns stay aligned with the row appended above.
        for model in models:
            model_completion = get_completion(model, test_prompt, temperature)
            evaluation_input[model].append(model_completion)

In [31]:
# Turn the accumulated columns into a table and persist it before displaying it.
evaluation_df = pd.DataFrame(data=evaluation_input)
evaluation_df.to_csv(path_or_buf=predictions_file, index=False)

evaluation_df

Unnamed: 0,prompt,ground_truth,temperature,ada:ft-personal:awesome-elion-2023-07-25-09-57-12,babbage:ft-personal:awesome-elion-2023-07-25-18-34-52,curie:ft-personal:awesome-elion-2023-07-25-10-01-39,davinci:ft-personal:awesome-elion-2023-07-25-18-52-58
0,View-Centric Context Modeling to Foster the En...,[Architectural Aspects],0.00,[Architecture Analysis Method],[Architecture Analysis Method],[Architecture Analysis Method],[Architecture Analysis Method]
1,View-Centric Context Modeling to Foster the En...,[Architectural Aspects],0.25,[Architecture Analysis Method],[Architecture Analysis Method],[Architecture Description Language],[Architecture Analysis Method]
2,View-Centric Context Modeling to Foster the En...,[Architectural Aspects],0.50,[Architecture Analysis Method],[Architecture Analysis Method],[Architecture Design Method],[Architecture Analysis Method]
3,View-Centric Context Modeling to Foster the En...,[Architectural Aspects],0.75,[Architecture Analysis Method],[Architecture Analysis Method],[Architecture Decision Making],[Architecture Analysis Method]
4,View-Centric Context Modeling to Foster the En...,[Architectural Aspects],1.00,[Architecture Description Language],[Architecture Design Method],"[Architecture Description, Architecture Analys...",[Architecture Description Language]
...,...,...,...,...,...,...,...
150,A Taxonomy of Blockchain-Based Systems for Arc...,[Architecture Decision Making],0.00,[Architecture Design Method],[Architecture Analysis Method],[Architecture Design Method],[Architecture Analysis Method]
151,A Taxonomy of Blockchain-Based Systems for Arc...,[Architecture Decision Making],0.25,[Architecture Design Method],[Architecture Analysis Method],[Architecture Design Method],[Architecture Analysis Method]
152,A Taxonomy of Blockchain-Based Systems for Arc...,[Architecture Decision Making],0.50,[Architecture Design Method],[Architecture Analysis Method],[Architecture Design Method],[Architecture Analysis Method]
153,A Taxonomy of Blockchain-Based Systems for Arc...,[Architecture Decision Making],0.75,[Architecture Design Method],"[Architecture Description, Architecture Analys...",[Architecture Design Method],[Architecture Assessment Method]


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# One record per (model, temperature) pair, appended in nested-loop order.
evaluation_results = []

# The rows for a given temperature are the same for every model, so slice
# them once instead of re-filtering evaluation_df inside the model loop.
rows_by_temperature = {
    temperature: evaluation_df[evaluation_df['temperature'] == temperature]
    for temperature in temperatures
}

for model in models:
    for temperature in temperatures:
        focus_df = rows_by_temperature[temperature]

        # Label vocabulary: every label seen in the ground truth or in this
        # model's predictions for the selected rows.
        all_labels = set()
        for labels in focus_df["ground_truth"]:
            all_labels.update(labels)
        for labels in focus_df[model]:
            all_labels.update(labels)

        # Convert ground truth and predicted labels to binary indicator arrays.
        # The classes are sorted so the column order is the same on every run
        # (set iteration order for strings is not stable across sessions);
        # key=str keeps the sort valid even if a prediction contains a
        # non-string item.
        mlb = MultiLabelBinarizer(classes=sorted(all_labels, key=str))
        ground_truth_binary = mlb.fit_transform(focus_df["ground_truth"])
        predicted_binary = mlb.transform(focus_df[model])

        # On multilabel indicator input, accuracy_score is subset accuracy: a
        # row counts as correct only if its whole label set matches exactly.
        accuracy = accuracy_score(ground_truth_binary, predicted_binary)
        # Micro-averaging pools all (row, label) decisions before computing
        # the scores.
        precision = precision_score(ground_truth_binary, predicted_binary, average='micro')
        recall = recall_score(ground_truth_binary, predicted_binary, average='micro')
        f1 = f1_score(ground_truth_binary, predicted_binary, average='micro')

        results = {
            "model": model,
            "temperature": temperature,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

        evaluation_results.append(results)

In [27]:
# Tabulate the per-(model, temperature) metric records, save them as CSV,
# then display the table.
evaluation_results_df = pd.DataFrame.from_records(data=evaluation_results)
evaluation_results_df.to_csv(path_or_buf=evaluation_file, index=False)

evaluation_results_df

Unnamed: 0,model,temperature,accuracy,precision,recall,f1
0,ada:ft-personal:awesome-elion-2023-07-25-09-57-12,0.0,0.419355,0.451613,0.4375,0.444444
1,ada:ft-personal:awesome-elion-2023-07-25-09-57-12,0.25,0.451613,0.483871,0.46875,0.47619
2,ada:ft-personal:awesome-elion-2023-07-25-09-57-12,0.5,0.451613,0.5,0.5,0.5
3,ada:ft-personal:awesome-elion-2023-07-25-09-57-12,0.75,0.387097,0.40625,0.40625,0.40625
4,ada:ft-personal:awesome-elion-2023-07-25-09-57-12,1.0,0.225806,0.242424,0.25,0.246154
5,babbage:ft-personal:awesome-elion-2023-07-25-1...,0.0,0.290323,0.3125,0.3125,0.3125
6,babbage:ft-personal:awesome-elion-2023-07-25-1...,0.25,0.258065,0.272727,0.28125,0.276923
7,babbage:ft-personal:awesome-elion-2023-07-25-1...,0.5,0.290323,0.3125,0.3125,0.3125
8,babbage:ft-personal:awesome-elion-2023-07-25-1...,0.75,0.290323,0.272727,0.28125,0.276923
9,babbage:ft-personal:awesome-elion-2023-07-25-1...,1.0,0.193548,0.27027,0.3125,0.289855
