### Requirements

In [1]:
import os
import re
import pandas as pd
# pip install nltk scikit-learn rouge  (`from rouge import Rouge` is provided by the `rouge` package, not `rouge-score`)
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

### Iterate over all generated text files in a folder to get the SPARQL output, ignoring the input natural-language question

In [2]:
def get_files_in_folder(folder_path):
    """
    List the regular files that sit directly inside a folder.

    Sub-directories are left out and the folder is not walked recursively.
    Problems while listing the folder are reported on stdout instead of
    being raised, in which case an empty list is returned.

    Args:
        folder_path (str): Path to the folder.

    Returns:
        list: File names (not full paths), in the order given by os.listdir.
    """
    found = []
    try:
        found = [
            name
            for name in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, name))
        ]
    except FileNotFoundError:
        print(f"The folder '{folder_path}' does not exist.")
    except Exception as err:
        print(f"An error occurred: {err}")

    return found

def calculate_bleu(reference, hypothesis):
    """
    Score a hypothesis against a single reference with sentence-level BLEU.

    Both strings are tokenized by whitespace splitting, and NLTK's
    SmoothingFunction.method1 is passed as the smoothing function.

    Args:
        reference (str): Ground-truth text.
        hypothesis (str): Generated text to evaluate.

    Returns:
        The value returned by nltk's sentence_bleu.
    """
    smoothing = SmoothingFunction().method1
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing,
    )

def calculate_rouge(reference, hypothesis):
    """
    Compute averaged ROUGE scores for a hypothesis against a reference.

    Args:
        reference (str): Ground-truth text.
        hypothesis (str): Generated text to evaluate.

    Returns:
        The result of Rouge.get_scores(hypothesis, reference, avg=True);
        callers in this file read it as scores['rouge-1' | 'rouge-2' |
        'rouge-l']['f'].
    """
    scorer = Rouge()
    return scorer.get_scores(hypothesis, reference, avg=True)


### Iterate over all files to extract the SPARQL queries, and save them in a new folder

In [None]:
# Source folder with the raw model outputs and destination folder for the
# cleaned SPARQL queries (one output file per input file, same file name).
folder_path = "results/generated_text/llama3.2_3b_instruct_lora"
file_names = get_files_in_folder(folder_path)

failed_files = []
sparql_folder = "results/clean-sparql/llama3.2_3b_instruct_lora"
os.makedirs(sparql_folder, exist_ok=True)

# Captures the text after the "Generated SPARQL:" marker, from there through
# the first SELECT and up to (not including) the next SELECT. A file with
# only one SELECT after the marker therefore does not match and is recorded
# as failed. Compiled once because it is reused for every file.
sparql_pattern = re.compile(r"Generated SPARQL:(.*?SELECT.*?)SELECT", re.DOTALL)

# Extract SPARQL queries from the generated text (which includes the input
# question and the output text).
for file in file_names:
    file_path = os.path.join(folder_path, file)
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    match = sparql_pattern.search(text)
    if match is None:
        print(f"No match found in file: {file}\n")
        failed_files.append(file)
        continue

    extracted_text = match.group(1).strip()
    # Save the extracted query under the original file name (the question id).
    new_file_path = os.path.join(sparql_folder, file)
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write(extracted_text)

print(f"Failed to extract SPARQL queries from {len(failed_files)} files.")

### BLEU and ROUGE

In [None]:
# Load the CSV file with the test questions; the 'id' and 'query' columns
# are the ones used below.
csv_file = 'xueli_data/test_questions.csv'
df = pd.read_csv(csv_file)

# Build the id -> ground-truth query lookup once instead of filtering the
# DataFrame for every file. Ids are keyed as strings because the id derived
# from a file name is a string, whereas read_csv may infer a numeric dtype
# for the 'id' column (which would make a direct == comparison never match).
# setdefault keeps the first row for a repeated id, like `.values[0]` did.
query_by_id = {}
for raw_id, query in zip(df['id'], df['query']):
    query_by_id.setdefault(str(raw_id), query)

bleu_score_list = []
rouge_1_score_list = []
rouge_2_score_list = []
rouge_l_score_list = []

# Loop through each file in the folder
sparql_folder = "results/clean-sparql/llama3.2_3b_instruct_lora"
sparql_list = get_files_in_folder(sparql_folder)

# Attention: not every question in test_questions.csv has a file in the
# clean-sparql folder, so only the files that exist are scored.
missing_ids = []
for file in sparql_list:
    # The question id is the part of the file name before the first dot.
    question_id = file.split('.')[0]

    # Get the ground-truth SPARQL query; skip (and report) files whose id is
    # not in the CSV instead of aborting the whole evaluation.
    if question_id not in query_by_id:
        print(f"No ground-truth query found for question id '{question_id}' (file: {file}); skipping.")
        missing_ids.append(question_id)
        continue
    sparql = query_by_id[question_id]

    # Load the generated query; explicit UTF-8 avoids locale-dependent decoding.
    with open(os.path.join(sparql_folder, file), 'r', encoding='utf-8') as f:
        generated_sparql = f.read()

    # Calculate BLEU and ROUGE scores
    bleu_score = calculate_bleu(sparql, generated_sparql)
    rouge_scores = calculate_rouge(sparql, generated_sparql)
    rouge_1 = rouge_scores['rouge-1']['f']
    rouge_2 = rouge_scores['rouge-2']['f']
    rouge_l = rouge_scores['rouge-l']['f']

    # Append the scores to the lists
    bleu_score_list.append(bleu_score)
    rouge_1_score_list.append(rouge_1)
    rouge_2_score_list.append(rouge_2)
    rouge_l_score_list.append(rouge_l)

if missing_ids:
    print(f"Skipped {len(missing_ids)} files without a ground-truth query.")

# Calculate the average scores; all four lists grow together, so checking
# one is enough to avoid dividing by zero when nothing was scored.
if bleu_score_list:
    avg_bleu_score = sum(bleu_score_list) / len(bleu_score_list)
    avg_rouge_1_score = sum(rouge_1_score_list) / len(rouge_1_score_list)
    avg_rouge_2_score = sum(rouge_2_score_list) / len(rouge_2_score_list)
    avg_rouge_l_score = sum(rouge_l_score_list) / len(rouge_l_score_list)
    print(f"Average BLEU Score: {avg_bleu_score:.2f}")
    print(f"Average ROUGE-1 Score: {avg_rouge_1_score:.2f}")
    print(f"Average ROUGE-2 Score: {avg_rouge_2_score:.2f}")
    print(f"Average ROUGE-L Score: {avg_rouge_l_score:.2f}")
else:
    print("No SPARQL files were scored; average scores are undefined.")