# Spice Evaluation

In [1]:
import editdistance
import pandas as pd
import os
import subprocess
from typing import List, Dict
from Models import *
from Database_Query import *
from Result_Type import *
from Import import get_file_paths, get_text_from_file
from Dataset_stats import list_question_types, list_question_type_ids
from Helpers import clean_prediction

## Evaluation Script

Command to run the evaluation for one question type: You need to start the SPICE database first:
```
python3 run_subtype_lf.py \
    --file_path ../../SPICE_code/Evaluation_test_data.json \
    --question_type "Simple Question (Direct)" \
    --em_only False \
    --server_link http://localhost:9999/blazegraph/namespace/wd/sparql \
    --out_eval_file ../../SPICE_code/Test_Eval_result.json
```

### Create Evaluation Files

In [12]:
input_path = "./Results/csv/Predictions/Test"
output_path = "./Results/json/Evaluation/Test"
model = ModelType.LORA
prompt_type = "zero-shot-chat-history-512"
clean = True

In [13]:
def handle_description(description) -> str:
    # Ensure that description is string
    return f"{description}"

def handle_response_entities(response_entity: str) -> List[str]:
    try:
        # replace double quotes with single quotes
        response_entity = response_entity.replace('"', "'")

        # make it a valid json
        response_entity = response_entity.replace("{'", '{"')
        response_entity = response_entity.replace("': ", '": ')
        response_entity = response_entity.replace(": '", ': "')
        response_entity = response_entity.replace("', ", '", ')
        response_entity = response_entity.replace(", '", ', "')
        response_entity = response_entity.replace("'}", '"}')

        response_entity = json.loads(response_entity)
        result = []
        for key, _ in response_entity.items():
            result.append(key)
        return result
    except Exception as e:
        print(f"Error while parsing str: {response_entity} with error: {e}")
        raise e

handle_response_entities("{'Q1517750': 'Medal ""For Strengthening of Brotherhood in Arms""', 'Q728960': 'Order of the October Revolution', 'Q583881': 'Order of Friendship of Peoples'}")

['Q1517750', 'Q728960', 'Q583881']

In [14]:
evaluation_file_df = pd.DataFrame(columns=['question_type', 'description', 'question', 'answer', 'actions', 'results', 'sparql_delex', 'turnID'])
file_paths = get_file_paths(input_path)

# Remove all files that do not contain the model name and prompt type
file_paths = [file_path for file_path in file_paths if model.value in file_path and prompt_type in file_path]

# In case of Lora zero-shot-chat-history, remove all files that contain zero-shot-chat-history-512
if model == ModelType.LORA and prompt_type == "zero-shot-chat-history":
    file_paths = [file_path for file_path in file_paths if "zero-shot-chat-history-512" not in file_path]

for file_path in file_paths:
    file_name = file_path.split("/")[-1].split(".csv")[0]
    folder_name = file_path.split("/")[-2]
    print(f'Processing folder: {folder_name}, file:{file_name}')

    predictions_df = pd.read_csv(file_path)

    # Create a temporary DataFrame to store the extracted values
    temp_df = pd.DataFrame(columns=evaluation_file_df.columns)

    # Add every second value of question_type to the evaluation dataframe
    temp_df['question_type'] = predictions_df['question_type'][::2].reset_index(drop=True)

    # Add every second value of description to the evaluation dataframe
    temp_df['description'] = predictions_df['description'][::2].apply(handle_description).reset_index(drop=True)

    # Add every second value of utterance to the evaluation dataframe
    temp_df['question'] = predictions_df['utterance'][::2].reset_index(drop=True)

    # Add every second value of utterance to the evaluation dataframe starting at index 1
    temp_df['answer'] = predictions_df['utterance'][1::2].reset_index(drop=True)

    if clean:
        temp_df['actions'] = predictions_df[f'prediction_{model.value}'][::2].apply(clean_prediction).reset_index(drop=True)
    else:
        temp_df['actions'] = predictions_df[f'prediction_{model.value}'][::2].reset_index(drop=True)

    # Add every second value of all_response_entities to the evaluation dataframe starting at index 1
    temp_df['results'] = predictions_df['all_response_entities'][1::2].apply(handle_response_entities).reset_index(drop=True)

    # Add sparql queries to the evaluation dataframe
    temp_df['sparql_delex'] = predictions_df['sparql_query'][1::2].apply(clean_prediction).reset_index(drop=True)

    # Add every second value of turnId to the evaluation dataframe
    temp_df['turnID'] = predictions_df['turnID'][::2].reset_index(drop=True)

    # Append the temporary DataFrame to the main evaluation_file_df
    evaluation_file_df = pd.concat([evaluation_file_df, temp_df], ignore_index=True)

# Export the evaluation dataframe to a json file
evaluation_file_df.to_json(f'{output_path}/test_eval_file_{model.value}_{prompt_type}.json', orient='records', lines=False)

evaluation_file_df.shape

Processing folder: QA_0, file:QA_90_lora-7b_zero-shot-chat-history-512
Processing folder: QA_2, file:QA_35_lora-7b_zero-shot-chat-history-512
Processing folder: QA_2, file:QA_80_lora-7b_zero-shot-chat-history-512
Processing folder: QA_4, file:QA_24_lora-7b_zero-shot-chat-history-512
Processing folder: QA_4, file:QA_39_lora-7b_zero-shot-chat-history-512
Processing folder: QA_8, file:QA_61_lora-7b_zero-shot-chat-history-512
Processing folder: QA_8, file:QA_74_lora-7b_zero-shot-chat-history-512
Processing folder: QA_12, file:QA_30_lora-7b_zero-shot-chat-history-512
Processing folder: QA_15, file:QA_56_lora-7b_zero-shot-chat-history-512
Processing folder: QA_20, file:QA_39_lora-7b_zero-shot-chat-history-512
Processing folder: QA_22, file:QA_45_lora-7b_zero-shot-chat-history-512
Processing folder: QA_24, file:QA_30_lora-7b_zero-shot-chat-history-512
Processing folder: QA_26, file:QA_96_lora-7b_zero-shot-chat-history-512
Processing folder: QA_27, file:QA_1_lora-7b_zero-shot-chat-history-512


(1500, 8)

### Execute Evaluation Script

* Execute it for each question type

In [15]:
dataset_input_path = "../SPICE_dataset/test"

question_types = list_question_types(dataset_input_path)
question_type_ids = list_question_type_ids(dataset_input_path)
print(question_types)
print(len(question_types))
print(question_type_ids)

['Simple Question (Direct)', 'Simple Question (Coreferenced)', 'Clarification', 'Comparative Reasoning (All)', 'Comparative Reasoning (Count) (All)', 'Quantitative Reasoning (Count) (All)', 'Logical Reasoning (All)', 'Verification (Boolean) (All)', 'Simple Question (Ellipsis)', 'Quantitative Reasoning (All)']
10
[1, 2, 3, 7, 8, 4, 5, 6]


In [16]:
for question_type in question_types:
    file_path = f"./Results/json/Evaluation/Test/test_eval_file_{model.value}_{prompt_type}.json"
    out_eval_file = f"./Results/json/Evaluation/Test/{question_type}/{model.value}_{prompt_type}.json"
    out_eval_folder = out_eval_file.split("/")[:-1]
    out_eval_folder = "/".join(out_eval_folder)

    # Check if folder exists and create it if not
    if not os.path.exists(out_eval_folder):
        os.makedirs(out_eval_folder)

    # Execute evaluation script
    print(f"Executing evaluation script for {question_type}")
    # If it consumes too much memory or gets stuck, try to run the command in the terminal
    cmd = [
        'python3',
        '../SPICE_repo/evaluation/run_subtype_lf.py',
        '--file_path', file_path,
        '--question_type', question_type,
        '--em_only', 'False',
        '--server_link', 'http://localhost:9999/blazegraph/namespace/wd/sparql',
        '--out_eval_file', out_eval_file,
        '--context_dist_file', '../SPICE_dataset/context_distance_test.log'
    ]
    subprocess.run(cmd)

Executing evaluation script for Simple Question (Direct)
*	Evaluating: 
['./Results/json/Evaluation/Test/test_eval_file_lora-7b_zero-shot-chat-history-512.json']
{'question_type': 'Simple Question (Direct)', 'description': 'Simple Question', 'question': 'What is the building where Dominique Colas was educated ?', 'answer': 'Lycée Thiers', 'actions': 'SELECT ?x WHERE { wd:Q3035075 wdt:P69 ?x . ?x wdt:P31 wd:Q41176 . }', 'results': ['Q3268957'], 'sparql_delex': 'SELECT ?x WHERE { wd:Q3035075 wdt:P69 ?x . ?x wdt:P31 wd:Q41176 . }', 'turnID': 'test#QA_0#QA_90#0'}
{'head': {'vars': ['x']}, 'results': {'bindings': [{'x': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3268957'}}]}}
{'x': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q3268957'}}
x
{'question_type': 'Simple Question (Direct)', 'description': 'Simple Question|Single Entity', 'question': 'Which administrative territory is Lycée Thiers situated in ?', 'answer': 'France', 'actions': 'SELECT ?x WHERE { wd:Q3268

### Create Summary

In [17]:
summary_data = []

for question_type in question_types:
    folder_path = f"./Results/json/Evaluation/Test/{question_type}"

    for file in os.listdir(folder_path):
        file_name = file.split('.json')[0]
        data = json.loads(get_text_from_file(f"{folder_path}/{file}"))
        result = data[question_type]

        # Check if a row with the same 'model-prompt' already exists
        existing_row_index = None
        for index, row in enumerate(summary_data):
            if row['model-prompt'] == file_name:
                existing_row_index = index
                break

        if existing_row_index is not None:
            # Update the existing row with the new question_type result
            summary_data[existing_row_index][question_type] = result
        else:
            # Create a new row for the dataframe with the file_name as model-prompt
            new_row = {'model-prompt': file_name, question_type: result}
            summary_data.append(new_row)

summary_df = pd.DataFrame(summary_data)
# sort summary_df by model-prompt
summary_df = summary_df.sort_values(by=['model-prompt'])
summary_df.to_csv('./Results/csv/Evaluations/Test/summary_general_evaluation_v01.csv', index=False)
summary_df.head(10)

Unnamed: 0,model-prompt,Simple Question (Direct),Simple Question (Coreferenced),Clarification,Comparative Reasoning (All),Comparative Reasoning (Count) (All),Quantitative Reasoning (Count) (All),Logical Reasoning (All),Verification (Boolean) (All),Simple Question (Ellipsis),Quantitative Reasoning (All)
3,gpt-3.5-turbo-0613_few-shot-chat-history,"{'instances': 460.0, 'precision': 0.9940357852...","{'instances': 308.0, 'precision': 0.5529010238...","{'instances': 67.0, 'precision': 0.0, 'recall'...","{'instances': 79.0, 'precision': 0.18333333333...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.4848484848484...","{'instances': 122.0, 'precision': 0.9700699300...","{'instances': 148, 'accuracy': 0.9256756756756...","{'instances': 57.0, 'precision': 0.79545454545...","{'instances': 48.0, 'precision': 0.09046454767..."
8,gpt-3.5-turbo-0613_zero-shot-chat-history,"{'instances': 460.0, 'precision': 0.9559471365...","{'instances': 308.0, 'precision': 0.4788321167...","{'instances': 67.0, 'precision': 0.0, 'recall'...","{'instances': 79.0, 'precision': 0.81538461538...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.1969696969696...","{'instances': 122.0, 'precision': 0.9638124362...","{'instances': 148, 'accuracy': 0.0, 'em': 0.0}","{'instances': 57.0, 'precision': 0.32098765432...","{'instances': 48.0, 'precision': 0.05270006506..."
1,llama-7b_few-shot-chat-history,"{'instances': 460.0, 'precision': 0.9896049896...","{'instances': 308.0, 'precision': 0.9857651245...","{'instances': 67.0, 'precision': 0.0, 'recall'...","{'instances': 79.0, 'precision': 0.07692307692...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.1515151515151...","{'instances': 122.0, 'precision': 0.9831223628...","{'instances': 148, 'accuracy': 0.0, 'em': 0.0}","{'instances': 57.0, 'precision': 0, 'recall': ...","{'instances': 48.0, 'precision': 0, 'recall': ..."
5,llama-7b_zero-shot-chat-history,"{'instances': 460.0, 'precision': 0, 'recall':...","{'instances': 308.0, 'precision': 0, 'recall':...","{'instances': 67.0, 'precision': 0, 'recall': ...","{'instances': 79.0, 'precision': 0, 'recall': ...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.0, 'em': 0.0}","{'instances': 122.0, 'precision': 0, 'recall':...","{'instances': 148, 'accuracy': 0.0, 'em': 0.0}","{'instances': 57.0, 'precision': 0, 'recall': ...","{'instances': 48.0, 'precision': 0, 'recall': ..."
6,lora-7b_few-shot-chat-history,"{'instances': 460.0, 'precision': 0.9971098265...","{'instances': 308.0, 'precision': 0.9947753396...","{'instances': 67.0, 'precision': 0.0, 'recall'...","{'instances': 79.0, 'precision': 0.8, 'recall'...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.4924242424242...","{'instances': 122.0, 'precision': 0.9765816148...","{'instances': 148, 'accuracy': 0.9256756756756...","{'instances': 57.0, 'precision': 0.87179487179...","{'instances': 48.0, 'precision': 0, 'recall': ..."
7,lora-7b_zero-shot-chat-history,"{'instances': 460.0, 'precision': 0.9994134897...","{'instances': 308.0, 'precision': 0.9961277831...","{'instances': 67.0, 'precision': 0.0, 'recall'...","{'instances': 79.0, 'precision': 0, 'recall': ...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.5909090909090...","{'instances': 122.0, 'precision': 0.9978858350...","{'instances': 148, 'accuracy': 0.9391891891891...","{'instances': 57.0, 'precision': 1.0, 'recall'...","{'instances': 48.0, 'precision': 0, 'recall': ..."
0,lora-7b_zero-shot-chat-history-512,"{'instances': 460.0, 'precision': 0.9994134897...","{'instances': 308.0, 'precision': 0.9962013295...","{'instances': 67.0, 'precision': 0.0, 'recall'...","{'instances': 79.0, 'precision': 0.58881256133...","{'instances': 79, 'accuracy': 0.16455696202531...","{'instances': 132, 'accuracy': 0.5909090909090...","{'instances': 122.0, 'precision': 0.9978858350...","{'instances': 148, 'accuracy': 0.9391891891891...","{'instances': 57.0, 'precision': 1.0, 'recall'...","{'instances': 48.0, 'precision': 0, 'recall': ..."
2,vicuna-7b_few-shot-chat-history,"{'instances': 460.0, 'precision': 0.8587570621...","{'instances': 308.0, 'precision': 0.9714285714...","{'instances': 67.0, 'precision': 0.02325581395...","{'instances': 79.0, 'precision': 0.8234375, 'r...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.0909090909090...","{'instances': 122.0, 'precision': 1.0, 'recall...","{'instances': 148, 'accuracy': 0.3648648648648...","{'instances': 57.0, 'precision': 0, 'recall': ...","{'instances': 48.0, 'precision': 0.05426356589..."
4,vicuna-7b_zero-shot-chat-history,"{'instances': 460.0, 'precision': 0.6, 'recall...","{'instances': 308.0, 'precision': 0, 'recall':...","{'instances': 67.0, 'precision': 0, 'recall': ...","{'instances': 79.0, 'precision': 0, 'recall': ...","{'instances': 79, 'accuracy': 0.0, 'em': 0.0}","{'instances': 132, 'accuracy': 0.0, 'em': 0.0}","{'instances': 122.0, 'precision': 0, 'recall':...","{'instances': 148, 'accuracy': 0.0, 'em': 0.0}","{'instances': 57.0, 'precision': 0, 'recall': ...","{'instances': 48.0, 'precision': 0, 'recall': ..."


In [18]:
summary_data = []

for question_type in question_types:
    folder_path = f"./Results/json/Evaluation/Test/{question_type}"

    for file in os.listdir(folder_path):
        file_name = file.split('.json')[0]
        data = json.loads(get_text_from_file(f"{folder_path}/{file}"))
        result = data[question_type]
        
        # Prepare data for DataFrame by adding 'model-prompt' and 'question_type' keys
        result.update({'model-prompt': file_name, 'question_type': question_type})
        summary_data.append(result)

# Create the DataFrame
summary_df = pd.DataFrame(summary_data)

# Set 'model-prompt' and 'question_type' as the index
summary_df.set_index(['model-prompt', 'question_type'], inplace=True)
# Export to csv sorted by model-prompt
summary_df.to_csv('./Results/csv/Evaluations/Test/summary_general_evaluation_v02.csv', index=True)
summary_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,instances,precision,recall,f1score,macro-f1score,em,accuracy
model-prompt,question_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lora-7b_zero-shot-chat-history-512,Simple Question (Direct),460.0,0.999413,0.766187,0.867396,0.981413,0.969565,
llama-7b_few-shot-chat-history,Simple Question (Direct),460.0,0.989605,0.214029,0.351941,0.751024,0.723913,
vicuna-7b_few-shot-chat-history,Simple Question (Direct),460.0,0.858757,0.068345,0.126614,0.246063,0.230435,
gpt-3.5-turbo-0613_few-shot-chat-history,Simple Question (Direct),460.0,0.994036,0.67446,0.803643,0.860638,0.741304,
vicuna-7b_zero-shot-chat-history,Simple Question (Direct),460.0,0.6,0.001349,0.002692,0.003188,0.0,
llama-7b_zero-shot-chat-history,Simple Question (Direct),460.0,0.0,0.0,0.0,0.0,0.0,
lora-7b_few-shot-chat-history,Simple Question (Direct),460.0,0.99711,0.930755,0.962791,0.940108,0.917391,
lora-7b_zero-shot-chat-history,Simple Question (Direct),460.0,0.999413,0.766187,0.867396,0.981413,0.969565,
gpt-3.5-turbo-0613_zero-shot-chat-history,Simple Question (Direct),460.0,0.955947,0.195144,0.324122,0.500224,0.336957,
lora-7b_zero-shot-chat-history-512,Simple Question (Coreferenced),308.0,0.996201,0.806923,0.891628,0.874675,0.873377,
