In [None]:
#!pip install openai
#!pip install tabulate
#!pip install python-dotenv

In [None]:
import os
import json
import time

from openai import OpenAI
from tabulate import tabulate
from dotenv import load_dotenv

In [None]:
# Read the .env file so its entries are available as environment variables
load_dotenv()

# The API credential is looked up under the API_KEY variable
api_key = os.environ.get("API_KEY")

# Client through which every chat-completion request in this notebook is sent
client = OpenAI(api_key=api_key)

# Models to evaluate, as [display label, API model name] pairs.
# Uncomment an entry to include that model in the run.
models = [
    ["GPT-3.5-Turbo", "gpt-3.5-turbo"],
    # ["GPT-4", "gpt-4"],
    # ["GPT-4-Turbo", "gpt-4-turbo"],
    # ["GPT-4o", "gpt-4o"],
]

In [None]:
# Load the test cases from the JSON file. Each entry is read later through its
# "statement" and "expected_classification" keys.
# The encoding is pinned to UTF-8 so non-ASCII text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open("test_cases.json", "r", encoding="utf-8") as file:
    test_cases = json.load(file)

# Load the classifier prompt template from the file (same explicit encoding,
# for the same reason). The evaluation loop substitutes each statement for the
# "{{USER_MESSAGE}}" placeholder in this text.
with open("../classifier.prompt", "r", encoding="utf-8") as file:
    classifier_prompt = file.read()

# Initialize variable to store the overall results: one summary row is
# appended per evaluated model.
overall_results = []

In [None]:
# Evaluate every configured model against the full test suite.
# For each model this prints a per-statement results table and summary metrics
# (both in markdown format) and appends one summary row to `overall_results`.
for model_label, model_name in models:

    # Initialize variables to store the results for the current model
    total_tests = len(test_cases)
    correct_predictions = 0
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0
    total_time = 0
    results = []

    # Run the test cases for the current model
    for test_case in test_cases:
        statement = test_case["statement"]
        expected_classification = test_case["expected_classification"]
        # Normalize the expected label once; it is compared several times below
        expected_label = expected_classification.lower()

        # Replace the placeholder in the classifier prompt with the statement
        final_prompt = classifier_prompt.replace("{{USER_MESSAGE}}", statement)

        # perf_counter() is monotonic, so the measured latency cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        # max_tokens=1 caps the reply at a single token.
        # NOTE(review): assumes each class label fits in one token for every
        # evaluated model - confirm when adding models.
        chat_completion = client.chat.completions.create(
            model=model_name,
            temperature=0.0,
            max_tokens=1,
            top_p=0.8,
            messages=[{"role": "user", "content": final_prompt}]
        )
        end_time = time.perf_counter()
        inference_time = end_time - start_time
        total_time += inference_time

        # Extract the model's prediction. The message content is optional in
        # the SDK, so a missing reply is treated as an empty prediction (which
        # is then scored as a mismatch) instead of raising AttributeError.
        raw_prediction = chat_completion.choices[0].message.content
        prediction = (raw_prediction or "").strip().lower()

        # Compare the prediction with the expected classification.
        # A mismatch is attributed by the expected label alone: expected
        # "true" counts as a false negative, anything else as a false
        # positive. This includes replies that are neither label.
        if prediction == expected_label:
            correct_predictions += 1
            if expected_label == "true":
                true_positives += 1
            else:
                true_negatives += 1
        else:
            if expected_label == "true":
                false_negatives += 1
            else:
                false_positives += 1

        # Store the result
        result = [statement, expected_classification, prediction, f"{inference_time:.2f}"]
        results.append(result)

    # Calculate the metrics for the current model. Every ratio falls back to 0
    # when its denominator is 0, so an empty test suite reports zeros instead
    # of raising ZeroDivisionError.
    accuracy = correct_predictions / total_tests if total_tests > 0 else 0
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    average_inference_time = total_time / total_tests if total_tests > 0 else 0

    # Print the results table for the current model in markdown format
    print(f"\n## Results for {model_label}")
    headers = ["Prompt", "Expected Output", "Prediction", "Inference Time"]
    table = tabulate(results, headers, tablefmt="pipe")
    print(table)

    # Print the metrics for the current model in markdown format
    print(f"\n### Metrics for {model_label}")
    print(f"- Accuracy: {accuracy:.2f}")
    print(f"- Precision: {precision:.2f}")
    print(f"- Recall: {recall:.2f}")
    print(f"- F1 Score: {f1_score:.2f}")
    print(f"- Average Inference Time (s): {average_inference_time:.2f} seconds")

    # Store the overall results for the current model
    overall_result = [model_label, accuracy, precision, recall, f1_score, average_inference_time]
    overall_results.append(overall_result)

In [None]:
# Summarize every evaluated model side by side as a markdown pipe table
headers = [
    "Model Name",
    "Accuracy",
    "Precision",
    "Recall",
    "F1 Score",
    "Avg Time (s)",
]
table = tabulate(overall_results, headers=headers, tablefmt="pipe")

# Heading first, then the table on the following line
print("\nOverall Results:", table, sep="\n")