In [None]:
# Uncomment to install the dependencies into the running kernel's environment
#%pip install ollama
#%pip install tabulate
#%pip install python-dotenv

In [None]:
import os
import json
import time

from ollama import Client
from tabulate import tabulate
from dotenv import load_dotenv

In [None]:
# Pull settings from a local .env file into the process environment
load_dotenv()

# Address of the Ollama server; None when OLLAMA_SERVER is not set
ollama_server = os.environ.get("OLLAMA_SERVER")

# Client bound to that address with timeout=60 (presumably seconds -- confirm)
client = Client(timeout=60, host=ollama_server)

# Models to evaluate. Each entry is [model name, model size]:
#   - the name is passed as `model=` to client.chat()
#   - the size is reported as billions of parameters in the results
# Only uncommented entries are run; uncomment a line to include that model.
models = [
    #["aya:8b", 8],
    #["llama3-chatqa:latest", 8],
    #["llama3-gradient:latest", 8],
    #["llama3:8b", 8],
    #["falcon:7b", 7],
    #["gemma:latest", 7],
    #["llama2:latest", 7],
    #["llava:7b", 7],
    #["mistral:latest", 7],
    #["nous-hermes:7b", 7],
    #["openchat:latest", 7],
    #["orca-mini:7b", 7],
    #["orca2:7b", 7],
    #["qwen2:7b", 7],
    #["samantha-mistral:latest", 7],
    #["wizardlm2:latest", 7],
    #["zephyr:latest", 7],
    #["phi3:latest", 3.8],
    #["orca-mini:3b", 3],
    #["phi:latest", 2.7],
    #["gemma:2b", 2],
    #["stablelm2:1.6b", 1.6],
    ["qwen2:1.5b", 1.5],
    #["tinydolphin:latest", 1.1],
    #["tinyllama:latest", 1.1],
    #["qwen2:0.5b", 0.5]
]

In [None]:
# Load the test cases from the JSON file.
# The encoding is given explicitly so the statements decode the same way on
# every platform instead of depending on the locale's default encoding.
with open("test_cases.json", "r", encoding="utf-8") as file:
    test_cases = json.load(file)

# Load the classifier prompt template from the file (same explicit encoding)
with open("../classifier.prompt", "r", encoding="utf-8") as file:
    classifier_prompt = file.read()

# Initialize variable to store the overall results (one summary row per model)
overall_results = []

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# Evaluate every configured model against all test cases, print a per-model
# markdown report, and record one summary row per model in overall_results.
for model_name, model_size in models:

    # Initialize variables to store the results for the current model
    total_tests = len(test_cases)
    correct_predictions = 0
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0
    total_time = 0
    failed_tests = 0
    results = []

    # Run the test cases for the current model
    for test_case in test_cases:
        statement = test_case["statement"]
        expected_classification = test_case["expected_classification"]
        # Normalise once; str() also tolerates JSON booleans (True -> "true")
        expected_label = str(expected_classification).lower()

        # Replace the placeholder in the classifier prompt with the statement
        final_prompt = classifier_prompt.replace("{{USER_MESSAGE}}", statement)

        # perf_counter() is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        try:
            response = client.chat(
                model=model_name,
                messages=[{
                    'role': 'user',
                    'content': final_prompt
                }],
                options={
                    'num_predict': 1,
                    'temperature': 0.0,
                    'top_k': 2,
                    'top_p': 0.8
                })
            inference_time = time.perf_counter() - start_time

            # Extract the model's prediction
            prediction = response['message']['content'].strip().lower()
        except Exception as exc:
            # Only ordinary errors are counted as failed tests; KeyboardInterrupt
            # and SystemExit propagate so the run can still be stopped by hand.
            failed_tests += 1
            results.append([statement, expected_classification,
                            f"Failed ({type(exc).__name__})", "-"])
            continue

        # Time is accumulated only for tests that completed, because the
        # average below is divided by the number of completed tests.
        total_time += inference_time

        # Compare the prediction with the expected classification.
        # Any wrong answer -- including output that is neither "true" nor
        # "false" -- is booked against the expected label (false negative when
        # "true" was expected, false positive otherwise).
        if prediction == expected_label:
            correct_predictions += 1
            if expected_label == "true":
                true_positives += 1
            else:
                true_negatives += 1
        else:
            if expected_label == "true":
                false_negatives += 1
            else:
                false_positives += 1

        # Store the result
        results.append([statement, expected_classification, prediction, f"{inference_time:.2f}"])

    # Calculate the metrics for the current model (failed tests are excluded
    # from accuracy and from the average inference time)
    completed_tests = total_tests - failed_tests
    accuracy = _safe_ratio(correct_predictions, completed_tests)
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
    average_time = _safe_ratio(total_time, completed_tests)
    failure_rate = _safe_ratio(100 * failed_tests, total_tests)

    # Print the results table for the current model in markdown format
    print(f"\n## Results for {model_name} (Size: {model_size} billion parameters)")
    headers = ["Prompt", "Expected Output", "Prediction", "Inference Time"]
    table = tabulate(results, headers, tablefmt="pipe")
    print(table)

    # Print the metrics for the current model in markdown format
    print(f"\n### Metrics for {model_name}")
    print(f"- Accuracy: {accuracy:.2f}")
    print(f"- Precision: {precision:.2f}")
    print(f"- Recall: {recall:.2f}")
    print(f"- F1 Score: {f1_score:.2f}")
    print(f"- Average Inference Time (s): {average_time:.2f} seconds")
    # Report an actual percentage (the label says "%"), plus the raw count
    print(f"- Failure Rate (%): {failure_rate:.2f} ({failed_tests} of {total_tests} tests failed)")

    # Store the overall results for the current model. Any earlier row for the
    # same model is replaced, so re-running this cell does not add duplicates.
    overall_result = [model_name, model_size, accuracy, precision, recall, f1_score, average_time, failed_tests]
    overall_results[:] = [row for row in overall_results if row[0] != model_name]
    overall_results.append(overall_result)

In [None]:
# Summarise every evaluated model in a single markdown (pipe) table
print("\n## Overall Results")
headers = [
    "Model Name",
    "Size (B)",
    "Accuracy",
    "Precision",
    "Recall",
    "F1 Score",
    "Avg Time (s)",
    "Failed Tests",
]
table = tabulate(overall_results, headers=headers, tablefmt="pipe")
print(table)