In [28]:
import pandas as pd

import re

def process_sparql_result(raw_result: str):
    """
    Turn the raw text of a SPARQL SELECT result into a set of row tuples.

    The text is read as tab-separated values, one result row per line. If the
    first line starts with ``?`` it is taken to be the header of variable names
    and is dropped.

    Args:
        raw_result: Result text. Any value that is not a ``str`` (for example
            the ``NaN`` pandas produces for an empty CSV cell) is treated as
            "no result".

    Returns:
        A set holding one tuple of column strings per non-blank data row. In
        every column only the text before the first ``^^`` is kept, which
        removes typed-literal suffixes such as ``^^<http://...>``. Because the
        result is a set, duplicate rows collapse and row order is not kept.
    """
    # Check the type explicitly instead of relying on a bare ``except``, which
    # would also have hidden unrelated errors.
    if not isinstance(raw_result, str):
        return set()

    lines = raw_result.strip().split('\n')

    # Drop the header row of variable names when there is one.
    data_lines = lines[1:] if lines[0].startswith('?') else lines

    result_set = set()
    for line in data_lines:
        stripped = line.strip()
        # Blank lines carry no bindings. Skipping them keeps an empty result an
        # empty set (consistent with the non-string case) instead of {('',)}.
        if not stripped:
            continue
        # Keep only the part of each column that precedes a ^^<datatype> suffix.
        result_set.add(tuple(col.split('^^')[0] for col in stripped.split('\t')))

    return result_set


# Exact-match scoring: compare the "message" and "gt_message" columns of every SUCCESS row.
def get_exact_match(file, ids_path="success_ids.txt"):
    """
    Compute exact-match (EM) accuracy between the "message" and "gt_message" columns.

    The CSV at ``file`` must provide the columns ``status``, ``message``,
    ``gt_message`` and ``question_id``. Only rows whose ``status`` is
    ``"SUCCESS"`` are scored: a row gets EM = 1 when ``process_sparql_result``
    yields the same set for ``message`` and ``gt_message``, otherwise EM = 0.

    Side effects:
        Prints the row counts and both averages, and writes the ``question_id``
        of every row with EM = 1 to ``ids_path`` (one id per line, file is
        overwritten).

    Args:
        file: Path (or anything ``pd.read_csv`` accepts) of the results CSV.
        ids_path: Where the ids of exactly matching queries are written. The
            default keeps the original hard-coded location.

    Returns:
        ``(average_em_success_set, average_em_test_set)``: the mean EM over the
        SUCCESS rows, and the sum of EM divided by the number of all rows in
        the CSV. Either value is NaN when its denominator is zero.
    """
    # read the csv file
    df = pd.read_csv(file)
    print(f"The number of all sparql queries is {len(df)}")

    # keep only the rows whose status is "SUCCESS"; copy so new columns do not touch df
    df_success = df[df['status'] == 'SUCCESS'].copy()
    print(f"Then number of the SUCCESS sparql queries is {len(df_success)}")

    # Score row by row. A plain list is used instead of DataFrame.apply(axis=1)
    # because apply returns a DataFrame (not a Series) when there are no rows,
    # which cannot be assigned to a single column.
    em_flags = [
        int(process_sparql_result(message) == process_sparql_result(gt_message))
        for message, gt_message in zip(df_success['message'], df_success['gt_message'])
    ]
    df_success['EM'] = pd.Series(em_flags, index=df_success.index, dtype='int64')

    # average EM over the SUCCESS queries only
    average_em_success_set = df_success['EM'].mean()
    print(f"The average EM value for all the SUCCESS sparql queries is {average_em_success_set}")

    # average EM over every query in the file (non-SUCCESS rows count as misses)
    total_queries = len(df)
    if total_queries:
        average_em_test_set = df_success['EM'].sum() / total_queries
    else:
        average_em_test_set = float('nan')
    print(f"The average EM value for all the test sparql queries is {average_em_test_set}")

    # save the ids of the exactly matching queries
    success_ids = df_success.loc[df_success['EM'] == 1, 'question_id']
    with open(ids_path, "w") as f:
        for question_id in success_ids:
            f.write(f"{question_id}\n")
    return average_em_success_set, average_em_test_set


def relaxed_exact_match(test_set1, ref_set2, threshold=0.5):
    """
    Score a test set against a reference set by how much of the reference it covers.

    Args:
        test_set1: Set of rows produced by the query under test.
        ref_set2: Set of reference (ground-truth) rows.
        threshold: Minimum overlap ratio that counts as a match.

    Returns:
        ``(flag, overlap_ratio)`` where ``overlap_ratio`` is the number of rows
        common to both sets divided by the size of ``ref_set2``, and ``flag`` is
        1 when that ratio is at least ``threshold``, else 0. An empty reference
        set always yields ``(0, 0)``.
    """
    shared = test_set1 & ref_set2

    # Guard: with nothing in the reference there is no ratio to compute.
    if len(ref_set2) == 0:
        return 0, 0

    overlap_ratio = len(shared) / len(ref_set2)
    flag = 1 if overlap_ratio >= threshold else 0
    return flag, overlap_ratio
    

def get_relaxed_exact_match(file, theshold=0.5, ids_path="success_ids.txt"):
    """
    Compute relaxed exact-match accuracy between "message" and "gt_message".

    The CSV at ``file`` must provide the columns ``status``, ``message``,
    ``gt_message`` and ``question_id``. Only rows whose ``status`` is
    ``"SUCCESS"`` are scored. Each row's ``message`` and ``gt_message`` are
    parsed with ``process_sparql_result`` and compared with
    ``relaxed_exact_match``, giving a 0/1 ``relaxed_EM`` flag and an
    ``overlap_ratio``.

    Side effects:
        Writes the ``question_id`` of every row with ``relaxed_EM == 1`` to
        ``ids_path`` (one id per line, file is overwritten).

    Args:
        file: Path (or anything ``pd.read_csv`` accepts) of the results CSV.
        theshold: Overlap ratio required for a match. The misspelt name is kept
            so existing keyword callers keep working.
        ids_path: Where the ids of matching queries are written. The default
            keeps the original hard-coded location.

    Returns:
        ``(average_em_success_set, average_em_test_set)``: the mean relaxed EM
        over the SUCCESS rows, and the sum of relaxed EM divided by the number
        of all rows in the CSV. Either value is NaN when its denominator is zero.
    """
    # read the csv file
    df = pd.read_csv(file)

    # keep only the rows whose status is "SUCCESS"; copy so new columns do not touch df
    df_success = df[df['status'] == 'SUCCESS'].copy()

    # Score every row once: (relaxed_EM flag, overlap ratio).
    scores = [
        relaxed_exact_match(
            process_sparql_result(message),
            process_sparql_result(gt_message),
            theshold,
        )
        for message, gt_message in zip(df_success['message'], df_success['gt_message'])
    ]

    # Attach the scores using the frame's own index. The filtered frame keeps
    # the row labels of the original CSV, so writing with .loc[i] for a
    # positional i would hit a different row or append a brand-new one, and the
    # saved question ids would no longer belong to the matching queries.
    df_success['relaxed_EM'] = pd.Series(
        [flag for flag, _ in scores], index=df_success.index, dtype='float64'
    )
    df_success['overlap_ratio'] = pd.Series(
        [ratio for _, ratio in scores], index=df_success.index, dtype='float64'
    )

    # average relaxed EM over the SUCCESS queries only
    average_em_success_set = df_success['relaxed_EM'].mean()

    # average relaxed EM over every query in the file (non-SUCCESS rows count as misses)
    total_queries = len(df)
    if total_queries:
        average_em_test_set = df_success['relaxed_EM'].sum() / total_queries
    else:
        average_em_test_set = float('nan')

    # save the ids of the queries with relaxed_EM = 1
    success_ids = df_success.loc[df_success['relaxed_EM'] == 1, 'question_id']
    with open(ids_path, "w") as f:
        for question_id in success_ids:
            f.write(f"{question_id}\n")
    return average_em_success_set, average_em_test_set

In [33]:
# Path of the CSV with the retrieved SPARQL results to score. `file` is a notebook
# global and is reused by the relaxed-EM threshold sweep cell.
file = "results/step3_sparql_running_against_orkg/ft/llama3.2_3b_lora_terminal/retrieved_results.csv"
# `em` is the tuple returned by get_exact_match: (EM over SUCCESS rows, EM over all rows).
# The call also prints its own summary and writes success_ids.txt.
em = get_exact_match(file)
# Visual separator after the printed summary.
print("####################")


The number of all sparql queries is 513
Then number of the SUCCESS sparql queries is 195
The average EM value for all the SUCCESS sparql queries is 0.7897435897435897
The average EM value for all the test sparql queries is 0.3001949317738791
####################


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sweep the overlap threshold from 0.1 to 1.0 in steps of 0.1 and collect, for each
# value, the relaxed EM over the SUCCESS queries and over the whole test set.
theshold_lst = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
relaxed_em_success_lst = []
relaxed_em_test_lst = []
for theshold in theshold_lst:
    average_em_success_set, average_em_test_set = get_relaxed_exact_match(file, theshold=theshold)
    relaxed_em_success_lst.append(average_em_success_set)
    relaxed_em_test_lst.append(average_em_test_set)

print("relatexd_em_success_lst: ", relaxed_em_success_lst)
print("relatexd_em_test_lst: ", relaxed_em_test_lst)

# Plot relaxed EM for the SUCCESS queries against the threshold.
x = theshold_lst
y1 = relaxed_em_success_lst
color1 = 'blue'

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y1, color=color1, label='relaxed EM for SUCCESS queries')
ax.scatter(x, y1, color=color1)

# Show the value of every point, rounded to 2 decimal places, just above its marker.
for threshold_value, em_value in zip(x, y1):
    ax.annotate(
        f"{em_value:.2f}",
        (threshold_value, em_value),
        textcoords="offset points",
        xytext=(0, 8),
        ha="center",
    )

ax.set_title('Relaxed Exact Match for SUCCESS queries')
ax.set_xlabel('Overlap Threshold')
ax.set_ylabel('Relaxed Exact Match')
# Put the ticks exactly on the thresholds that were evaluated.
ax.set_xticks(x)
ax.legend()
plt.show()

In [20]:
# Hand-entered ratio: both operands are literals, not values computed in this notebook.
# NOTE(review): record what 330 and 492 count, or derive them from the results file.
330/492

0.6707317073170732

In [21]:
# Hand-entered ratio: both operands are literals, not values computed in this notebook.
# NOTE(review): record what 193 and 492 count, or derive them from the results file.
193/492

0.39227642276422764

In [22]:
# Hand-entered ratio: both operands are literals, not values computed in this notebook.
# NOTE(review): record what 3 and 21 count, or derive them from the results file.
3/21

0.14285714285714285

In [38]:
# Hand-entered ratio: both operands are literals, not values computed in this notebook.
# NOTE(review): record what 1 and 30 count, or derive them from the results file.
1/30

0.03333333333333333