In [1]:
import csv
import ast
import random

### Load Rephrased Data from CSV

In [2]:
def load_csv_data(file_path, bool_params=None, encoding=None):
    """Load a CSV file into a list of row dictionaries.

    Rows are read with ``csv.DictReader``. The ``choices`` column of every row
    is parsed from its Python-literal text form with ``ast.literal_eval``, and
    every column named in ``bool_params`` is converted from text to ``bool``.

    Args:
        file_path: Path of the CSV file to read.
        bool_params: Iterable of column names that hold boolean text. Values
            are matched case-insensitively and surrounding whitespace is
            ignored, so ``"True"``, ``"TRUE"`` and ``"true"`` all become
            ``True``. ``None`` (the default) converts no columns.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one dict per CSV row, in file order.

    Raises:
        KeyError: If a row has no ``choices`` column or lacks one of the
            columns named in ``bool_params``.
        TypeError: If a boolean column holds anything other than a spelling
            of true/false (including a missing cell).
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            ``choices`` is not a valid Python literal.
    """
    # Materialise once so a one-shot iterable is still applied to every row.
    bool_columns = tuple(bool_params) if bool_params is not None else ()
    data_list = []

    # newline='' is what the csv module expects for file objects, so that
    # newlines embedded in quoted fields are handled by the reader itself.
    with open(file_path, newline='', encoding=encoding) as csvfile:
        for row in csv.DictReader(csvfile):
            # "choices" is stored as the text of a Python literal, e.g. "['a', 'b']".
            row["choices"] = ast.literal_eval(row["choices"])

            for param in bool_columns:
                raw_value = row[param]
                # Non-string cells (e.g. None for a short row) are never valid booleans.
                normalized = raw_value.strip().lower() if isinstance(raw_value, str) else None

                if normalized == "true":
                    row[param] = True
                elif normalized == "false":
                    row[param] = False
                else:
                    raise TypeError(f"{param} data cannot be recognized: {raw_value!r}")

            data_list.append(row)

    return data_list

def load_all_rephrase_data(split, prompt_ver, dir_path, file_name, bool_params=[]):
    """Load one CSV per split and return them keyed by split name.

    For every entry ``s`` of ``split`` the file
    ``{dir_path}/{s}/{prompt_ver}_{s}_{file_name}`` is read with
    ``load_csv_data``.

    Args:
        split: Iterable of split names.
        prompt_ver: Prompt-version prefix used in the file name.
        dir_path: Directory that contains one sub-directory per split.
        file_name: Trailing part of each CSV file name.
        bool_params: Column names forwarded to ``load_csv_data`` for boolean
            conversion. The default list is only read, never modified.

    Returns:
        A dict mapping each split name to the value returned by
        ``load_csv_data`` for that split's file.
    """
    return {
        split_name: load_csv_data(
            f"{dir_path}/{split_name}/{prompt_ver}_{split_name}_{file_name}",
            bool_params,
        )
        for split_name in split
    }


# Dataset splits to load; load_all_rephrase_data reads one CSV per split.
split = ["validation", "test", "train"]

# Boolean-text columns of the "name" sheet (single verdict column).
name_params = ["concept", "name", "option", "verdict"]
# Boolean-text columns of the other sheets (low and high verdict columns).
all_params = ["concept", "name", "option", "low_verdict", "high_verdict"]

# evaluated_data[eval_type] is a list with one entry per loaded prompt version
# (only "v3" here); each entry maps a split name to that split's list of rows.
evaluated_data = {
    eval_type: [
        load_all_rephrase_data(split, "v3", "eval", csv_name, bool_params=bool_columns),
    ]
    for eval_type, csv_name, bool_columns in (
        ("name", "name.csv", name_params),
        ("both", "both.csv", all_params),
        ("concept", "concept.csv", all_params),
        ("option", "option.csv", all_params),
    )
}

In [3]:
# Print, for every split, the per-evaluation-type accuracy followed by the
# accuracy aggregated over all evaluation types except "name".
for s in split:
    print("====================================")
    print(f"Data Split: {s}")

    # Running totals over the non-"name" evaluations. Index 0 is the single
    # loaded prompt version, mirroring the one-entry lists in evaluated_data.
    all_data = [0]
    all_true = [0]

    for eval_type, eval_data in evaluated_data.items():
        # The "name" sheet is scored on "verdict"; every other sheet is scored
        # on its "high_verdict" column.
        eval_attr = "verdict" if eval_type == "name" else "high_verdict"

        split_rows = eval_data[0][s]
        n_sampled = len(split_rows)
        v1_true = sum(1 for d in split_rows if d[eval_attr])
        # An empty split would otherwise abort the whole report with 0 / 0.
        accuracy = v1_true / n_sampled if n_sampled else float("nan")

        if eval_type != "name":
            all_data[0] += n_sampled
            all_true[0] += v1_true

        print(f"""
Evaluation Attributes: {eval_type}
Number of Sampled Data: {n_sampled}
v3 Prompt Results:
    - True: {v1_true}
    - False: {n_sampled - v1_true}
    - Accuracy: {accuracy}""")

    # Same guard for the aggregate, in case every non-"name" sheet is empty.
    overall_accuracy = all_true[0] / all_data[0] if all_data[0] else float("nan")

    print(f"""
Rephrase All Evaluation Conclusion
Number of Sampled Data: {all_data[0]}
v3 Prompt Results:
    - True: {all_true[0]}
    - False: {all_data[0] - all_true[0]}
    - Accuracy: {overall_accuracy}""")

Data Split: validation

Evaluation Attributes: name
Number of Sampled Data: 51
v3 Prompt Results:
    - True: 48
    - False: 3
    - Accuracy: 0.9411764705882353

Evaluation Attributes: both
Number of Sampled Data: 10
v3 Prompt Results:
    - True: 5
    - False: 5
    - Accuracy: 0.5

Evaluation Attributes: concept
Number of Sampled Data: 23
v3 Prompt Results:
    - True: 19
    - False: 4
    - Accuracy: 0.8260869565217391

Evaluation Attributes: option
Number of Sampled Data: 52
v3 Prompt Results:
    - True: 42
    - False: 10
    - Accuracy: 0.8076923076923077

Rephrase All Evaluation Conclusion
Number of Sampled Data: 85
v3 Prompt Results:
    - True: 66
    - False: 19
    - Accuracy: 0.7764705882352941
Data Split: test

Evaluation Attributes: name
Number of Sampled Data: 46
v3 Prompt Results:
    - True: 44
    - False: 2
    - Accuracy: 0.9565217391304348

Evaluation Attributes: both
Number of Sampled Data: 8
v3 Prompt Results:
    - True: 6
    - False: 2
    - Accuracy: 0.7