## This is the script to evaluate methods
1. Run method i on the test examples
2. The results will be stored in a JSON file
3. Evaluate the result statistics
4. Repeat steps 1-3 for each method

In [46]:
import re
import json
import os
import ast
import random
from datetime import datetime
from openai import OpenAI
from dotenv import load_dotenv
from utils.labeling_utils import re_analyze_email, gpt_label, gpt_label_eg, gpt_label_ft
from utils.evaluation_utils import evaluate_label_single, calculate_overall_metrics

In [47]:
# Load the held-out test set and the training set.
# JSON text is UTF-8, so the encoding is given explicitly instead of relying
# on the platform default (which is not UTF-8 on every system).
test_data_path = "./data/PAIRED_test.json"
with open(test_data_path, 'r', encoding="utf-8") as file:
    test_data = json.load(file)

train_data_path = "./data/PAIRED_train.json"
with open(train_data_path, 'r', encoding="utf-8") as file:
    train_data = json.load(file)

In [48]:
# Baseline per-field weights; the six values sum to 1.0.
default_weights = {
    "Time_Sensitive": 0.2,
    "Type": 0.15,
    "Category": 0.15,
    "Format": 0.1,
    "Time Period": 0.2,
    "Priority_Level": 0.2,
}

# Weights handed to evaluate_label_single in the cells below. Same fields and
# key order as the baseline, with "Time Period" raised to 0.4 and the other
# overridden fields lowered to 0.1 so the total stays at 1.0.
weights = {
    **default_weights,
    "Type": 0.1,
    "Category": 0.1,
    "Time Period": 0.4,
    "Priority_Level": 0.1,
}

In [49]:
def parse_label(label):
    """
    Normalize a predicted label into a dictionary.

    A dict is returned unchanged. A string is parsed, first as JSON (so
    ``true``/``false``/``null`` are understood) and then as a Python literal
    (single-quoted dict repr). A surrounding Markdown code fence, e.g.
    a ```json ... ``` wrapper, is removed before parsing.

    Parameters:
        label (str or dict): Input label to process.

    Returns:
        dict: The parsed label dictionary, or a default label if the input is
        not a dict and cannot be parsed into one.
    """
    default_label = {
        "Spam": "No",
        "Subject": "Default Evaluation Label",
        "Sender": "",
        "send_date": "",
        "Time_Sensitive": "No",
        "Start": "",
        "End": "",
        "Type": "N/A",
        "Category": "N/A",
        "Format": "N/A",
        "Location": "",
        "Action_Required": "No",
        "Priority_Level": "N/A"
    }

    if isinstance(label, dict):
        return label

    if not isinstance(label, str):
        # Neither string nor dictionary: nothing to parse.
        return default_label

    text = label.strip()
    if text.startswith("```"):
        # Drop the opening fence (with optional language tag) and the
        # closing fence, keeping only the payload in between.
        text = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", text)

    # JSON first, Python literal second: the two syntaxes differ in quoting
    # and in the spelling of booleans/null, and either may be produced.
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval can raise any of these on malformed input;
            # json.JSONDecodeError is a ValueError subclass.
            continue
        if isinstance(parsed, dict):
            return parsed

    # Parsing failed or produced something other than a dictionary.
    return default_label

In [50]:
# Sanity check of the metric code: score every ground-truth label against
# itself and display the aggregated metrics for that perfect prediction.
regex_results = [
    evaluate_label_single(sample['label'], sample['label'], weights=weights)
    for sample in test_data
]
calculate_overall_metrics(regex_results)

{'Binary Metrics': {'Spam': {'ACC': 1.0, 'F1': 1.0, 'Recall': 1.0},
  'Time_Sensitive': {'ACC': 1.0, 'F1': 1.0, 'Recall': 1.0}},
 'Categorical Metrics': {'Time Period': 1.0,
  'Type': 1.0,
  'Category': 1.0,
  'Format': 1.0,
  'Priority_Level': 1.0},
 'Averaged Weighted Score': 1.0}

## 1. REGEX baseline

It is very hard to use regex to catch natural-language time expressions.

In [51]:
# Destination JSON file for the regex-baseline evaluation results.
regex_save_path = "./test/EVAL_regex_baseline.json"

In [52]:
# Run the regex baseline over every test email: predict a label, normalize
# it to a dict, score it against the ground truth, and keep a full record
# (input, prediction, truth, score) for the saved report.
regex_results, save_results = [], []
for sample in test_data:
    predicted = parse_label(re_analyze_email(sample['content']))
    score = evaluate_label_single(predicted, sample['label'], weights=weights)
    regex_results.append(score)
    record = {
        "content": sample['content'],
        "pred_label": predicted,
        "true_label": sample['label'],
        "result": score,
    }
    save_results.append(record)

In [53]:
# Aggregate the per-email scores of the regex baseline.
metrics = calculate_overall_metrics(regex_results)
eval_regex = {
    "eval": save_results,
    "metrics": metrics
}
# Create the output directory first so a missing folder does not make the
# write fail after the evaluation has already run; write UTF-8 explicitly
# rather than depending on the platform default encoding.
os.makedirs(os.path.dirname(regex_save_path) or ".", exist_ok=True)
with open(regex_save_path, "w", encoding="utf-8") as f:
    json.dump(eval_regex, f, indent=4)

In [54]:
# Show the aggregated regex-baseline metrics as the cell output.
metrics

{'Binary Metrics': {'Spam': {'ACC': 0.56,
   'F1': 0.717948717948718,
   'Recall': 0.56},
  'Time_Sensitive': {'ACC': 0.0, 'F1': 0.0, 'Recall': 0.0}},
 'Categorical Metrics': {'Time Period': 0.36,
  'Type': 0.44,
  'Category': 0.4,
  'Format': 0.44,
  'Priority_Level': 0.24},
 'Averaged Weighted Score': 0.16400000000000003}

## 2. GPT with prompt

In [55]:
# Load environment variables via python-dotenv before creating the client.
load_dotenv()
# Create the OpenAI client with no explicit arguments; presumably it picks up
# the API key from the environment loaded above — TODO confirm.
client = OpenAI()
# Base chat model passed to the prompt-only and few-shot labeling calls.
model_name="gpt-4o-mini"

In [56]:
# Destination JSON file for the prompt-only GPT evaluation results.
gpt_prompt_save_path = "./test/EVAL_prompt_baseline.json"

In [57]:
# Prompt-only GPT method: label each test email, normalize the reply into a
# dict, score it against the ground truth, and record everything needed for
# the saved report.
prompt_results, save_results = [], []
for sample in test_data:
    raw_prediction = gpt_label(
        client, sample['content'], temperature=0.7, model=model_name
    )
    predicted = parse_label(raw_prediction)
    score = evaluate_label_single(predicted, sample['label'], weights=weights)
    prompt_results.append(score)
    record = {
        "content": sample['content'],
        "pred_label": predicted,
        "true_label": sample['label'],
        "result": score,
    }
    save_results.append(record)

In [58]:
# Aggregate the per-email scores of the prompt-only GPT method.
metrics = calculate_overall_metrics(prompt_results)
eval_prompt = {
    "eval": save_results,
    "metrics": metrics
}
# Create the output directory first so a missing folder does not make the
# write fail after all API calls have already been made; write UTF-8
# explicitly rather than depending on the platform default encoding.
os.makedirs(os.path.dirname(gpt_prompt_save_path) or ".", exist_ok=True)
with open(gpt_prompt_save_path, "w", encoding="utf-8") as f:
    json.dump(eval_prompt, f, indent=4)

## 3. GPT prompt with examples

In [59]:
# Destination JSON file for the few-shot (prompt with examples) evaluation results.
gpt_prompt_eg_save_path = "./test/EVAL_prompt_eg.json"

In [60]:
# Few-shot GPT method: label each test email with training examples supplied
# to the prompt, then score and record the prediction.
prompt_results = []
save_results = []
# A seeded generator and a private copy of the training set make the example
# ordering reproducible between runs and leave the shared train_data list
# unmodified for any other cell that reads it.
shuffle_rng = random.Random(42)
few_shot_examples = list(train_data)
for data in test_data:
    # Reshuffle before every call so each test email, including the first,
    # is given a freshly ordered set of examples.
    shuffle_rng.shuffle(few_shot_examples)
    label = gpt_label_eg(client, data['content'], few_shot_examples, temperature=0.7, model=model_name)
    label = parse_label(label)
    result = evaluate_label_single(label, data['label'], weights=weights)
    prompt_results.append(result)
    save_results.append({
        "content": data['content'],
        "pred_label": label,
        "true_label": data['label'],
        "result": result
    })

In [61]:
# Aggregate the per-email scores of the few-shot GPT method.
metrics = calculate_overall_metrics(prompt_results)
eval_prompt = {
    "eval": save_results,
    "metrics": metrics
}
# Create the output directory first so a missing folder does not make the
# write fail after all API calls have already been made; write UTF-8
# explicitly rather than depending on the platform default encoding.
os.makedirs(os.path.dirname(gpt_prompt_eg_save_path) or ".", exist_ok=True)
with open(gpt_prompt_eg_save_path, "w", encoding="utf-8") as f:
    json.dump(eval_prompt, f, indent=4)

## 4. Fine-tuned Model

In [62]:
# Destination JSON file for the fine-tuned-model evaluation results.
gpt_prompt_ft_save_path = "./test/EVAL_prompt_ft.json"
# Identifier of the fine-tuned model passed to gpt_label_ft.
ft_model_name = "ft:gpt-4o-mini-2024-07-18:personal:ft-schedular:AXN6Qt3B"

In [63]:
# Fine-tuned model: label each test email, normalize the reply into a dict,
# score it against the ground truth, and record everything needed for the
# saved report.
prompt_results, save_results = [], []
for sample in test_data:
    raw_prediction = gpt_label_ft(
        client, sample['content'], temperature=0.7, model=ft_model_name
    )
    predicted = parse_label(raw_prediction)
    score = evaluate_label_single(predicted, sample['label'], weights=weights)
    prompt_results.append(score)
    record = {
        "content": sample['content'],
        "pred_label": predicted,
        "true_label": sample['label'],
        "result": score,
    }
    save_results.append(record)

# Aggregate the per-email scores of the fine-tuned model.
metrics = calculate_overall_metrics(prompt_results)
eval_prompt = {
    "eval": save_results,
    "metrics": metrics
}
# Create the output directory first so a missing folder does not make the
# write fail after all API calls have already been made; write UTF-8
# explicitly rather than depending on the platform default encoding.
os.makedirs(os.path.dirname(gpt_prompt_ft_save_path) or ".", exist_ok=True)
with open(gpt_prompt_ft_save_path, "w", encoding="utf-8") as f:
    json.dump(eval_prompt, f, indent=4)