# Helpers

## Translate subject names to English

In [59]:
# Lookup table from the Portuguese subject names found in the raw result
# files to short English labels. The cleaning functions apply it with
# Series.map, so a subject that is not listed here ends up as NaN in the
# cleaned output.
subject_translation = {
    "Matemática e suas Tecnologias": "Mathematics",
    "Linguagens, Códigos e suas Tecnologias": "Languages",
    "Ciências Humanas e suas Tecnologias": "Human Sciences",
    "Ciências da Natureza e suas Tecnologias": "Natural Sciences"
}

## Process files

In [60]:
import os
import re
import pandas as pd


def process_files(raw_dir, cleaned_dir, file_prefix, cleaning_func):
    """
    Clean every raw result CSV in raw_dir that has no cleaned counterpart yet.

    Raw files must be named exactly <file_prefix>_YYYY-MM-DD_HH-MM-SS.csv.
    For each such file the cleaned version is written to
    <cleaned_dir>/<file_prefix>_clean_YYYY-MM-DD_HH-MM-SS.csv. Files whose
    cleaned version already exists are skipped, so the function can be
    re-run safely. Progress and errors are reported with print(); an error
    in one file does not stop the processing of the remaining files.

    Parameters:
      raw_dir (str): Directory containing the raw CSV files.
      cleaned_dir (str): Directory where cleaned CSV files will be saved
        (created if it does not exist).
      file_prefix (str): The prefix part of the filename (e.g., "cove_few-shot_results").
      cleaning_func (function): A function that takes a DataFrame and returns a cleaned DataFrame.

    Returns:
      None
    """
    # Ensure the cleaned directory exists.
    os.makedirs(cleaned_dir, exist_ok=True)

    # Build a regex pattern for filenames using the provided prefix.
    # It expects filenames like <file_prefix>_YYYY-MM-DD_HH-MM-SS.csv.
    pattern = re.compile(
        rf'({re.escape(file_prefix)})_(\d{{4}}-\d{{2}}-\d{{2}}_\d{{2}}-\d{{2}}-\d{{2}})\.csv'
    )

    # Sorted so that processing order (and the printed log) is deterministic;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(raw_dir)):
        if not filename.endswith(".csv"):
            continue

        # fullmatch (not match) so that names with extra trailing text, such
        # as "<prefix>_<timestamp>.csv.bak.csv", are rejected instead of
        # being mistaken for a raw result file.
        match = pattern.fullmatch(filename)
        if not match:
            print(f"Filename does not match pattern: {filename}")
            continue

        base_name = match.group(1)
        timestamp = match.group(2)
        # Construct the cleaned filename.
        cleaned_filename = f"{base_name}_clean_{timestamp}.csv"
        cleaned_filepath = os.path.join(cleaned_dir, cleaned_filename)

        if os.path.exists(cleaned_filepath):
            print(f"Cleaned file already exists: {cleaned_filename}")
            continue

        raw_filepath = os.path.join(raw_dir, filename)
        # Write to a temporary name first and rename on success. Writing
        # straight to the final path would leave a truncated file behind if
        # to_csv failed midway, and the existence check above would then
        # skip that file on every later run.
        tmp_filepath = cleaned_filepath + ".tmp"
        try:
            df = pd.read_csv(raw_filepath)
            # Apply the cleaning function.
            cleaned_df = cleaning_func(df)
            # Save the cleaned DataFrame.
            cleaned_df.to_csv(tmp_filepath, index=False)
            os.replace(tmp_filepath, cleaned_filepath)
            print(f"Processed and saved cleaned file: {cleaned_filename}")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the batch.
            print(f"Error processing file {filename}: {e}")
            try:
                if os.path.exists(tmp_filepath):
                    os.remove(tmp_filepath)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {tmp_filepath}: {cleanup_error}")

## Self-Refine cleaner

In [68]:
def extract_answer_letter(response):
    """
    Extract the chosen alternative (A-E) from a free-text model response.

    The response is searched, case-insensitively, first for a
    "Resposta final" marker followed by a letter (e.g. "Resposta final: C)"
    or "Resposta final - c"), and, if that is absent, for "letra <letter>".

    Parameters:
      response: The response text. Non-string values (None, NaN from a
        missing CSV cell, ...) are treated as "no answer".

    Returns:
      The upper-cased letter as a str, or None when no answer is found.
    """
    # Series.apply passes NaN (a float) for missing cells; re.search would
    # raise TypeError on it.
    if not isinstance(response, str):
        return None

    # Match "Resposta final: C)" or "Resposta final: C".
    # The trailing \b requires the letter to stand alone, so the first
    # letter of a following word ("Resposta final: Errado") is not taken
    # as the answer.
    match = re.search(r"resposta final\s*[:\-]?\s*([A-E])\b", response, re.IGNORECASE)
    if not match:
        # Try fallback patterns
        match = re.search(r"letra\s+([A-E])\b", response, re.IGNORECASE)
    return match.group(1).upper() if match else None

In [62]:
def clean_self_refine(df):
    """
    Cleaning function for the self-refine method.

    Keeps id, ground_truth, predicted and correct as they are, translates
    subject through subject_translation, and adds an 'initial_answer'
    column that is taken from, in order of preference:
      1. the existing "initial_answer" column;
      2. the "baseline_answer" column, reduced to an answer letter with
         extract_answer_letter;
      3. None, when neither column is present.
    """
    available = df.columns
    if "initial_answer" in available:
        first_answers = df["initial_answer"]
    elif "baseline_answer" in available:
        # Older files only carry the raw baseline text; pull the letter out.
        first_answers = df["baseline_answer"].apply(extract_answer_letter)
    else:
        first_answers = None

    columns = {
        "id": df["id"],
        "subject": df["subject"].map(subject_translation),
    }
    for name in ("ground_truth", "predicted", "correct"):
        columns[name] = df[name]
    columns["initial_answer"] = first_answers
    return pd.DataFrame(columns)

## CoVe cleaner

In [63]:
def clean_cove(df):
    """
    Cleaning function for the cove method.

    Returns a new DataFrame with id, subject (translated through
    subject_translation), ground_truth, predicted, correct and the extra
    'initial_answer' column, in that order.
    """
    columns = {
        "id": df["id"],
        "subject": df["subject"].map(subject_translation),
    }
    # The remaining columns are copied over unchanged.
    for name in ("ground_truth", "predicted", "correct", "initial_answer"):
        columns[name] = df[name]
    return pd.DataFrame(columns)

## CoT cleaner

In [64]:
def clean_cot(df):
    """
    Cleaning function for the cot method.

    Returns a new DataFrame with id, subject (translated through
    subject_translation), ground_truth, predicted and correct, in that
    order. No 'initial_answer' column is produced.
    """
    columns = {
        "id": df["id"],
        "subject": df["subject"].map(subject_translation),
    }
    # The remaining columns are copied over unchanged.
    for name in ("ground_truth", "predicted", "correct"):
        columns[name] = df[name]
    return pd.DataFrame(columns)

# Clean data

### CoVe

In [65]:
# Clean the CoVe few-shot results: raw CSVs are read from raw_dir_cove and
# the cleaned copies are written to cleaned_dir_cove using clean_cove.
# Files that already have a cleaned copy are skipped by process_files.
raw_dir_cove = "results/cove_few-shot"
cleaned_dir_cove = "results/cove_few-shot_clean"
process_files(raw_dir_cove, cleaned_dir_cove, "cove_few-shot_results", clean_cove)

Cleaned file already exists: cove_few-shot_results_clean_2025-04-01_02-41-38.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-31_04-12-49.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-28_17-19-55.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-30_20-51-22.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-30_18-11-48.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-30_23-08-43.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-04-01_07-45-02.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-30_15-33-09.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-31_16-41-31.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-28_12-04-04.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-30_19-30-48.csv
Cleaned file already exists: cove_few-shot_results_clean_2025-03-30_16-53-21.csv
Cleaned file already exists:

### CoT

In [66]:
# Clean the CoT few-shot results: raw CSVs are read from raw_dir_cot and
# the cleaned copies are written to cleaned_dir_cot using clean_cot.
# Files that already have a cleaned copy are skipped by process_files.
raw_dir_cot = "results/cot_few-shot"
cleaned_dir_cot = "results/cot_few-shot_clean"
process_files(raw_dir_cot, cleaned_dir_cot, "cot_few-shot_results", clean_cot)

Cleaned file already exists: cot_few-shot_results_clean_2025-03-31_11-28-31.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-31_11-07-13.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-31_18-37-21.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-31_01-11-32.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-30_14-38-11.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-28_11-16-41.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-30_18-47-39.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-04-01_04-43-47.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-30_20-07-08.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-31_11-14-23.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-03-30_13-24-37.csv
Cleaned file already exists: cot_few-shot_results_clean_2025-04-01_09-50-21.csv
Cleaned file already exists: cot_few-sho

### Self-Refine

In [67]:
# Clean the Self-Refine few-shot results: raw CSVs are read from
# raw_dir_self_refine and the cleaned copies are written to
# cleaned_dir_self_refine using clean_self_refine.
# Files that already have a cleaned copy are skipped by process_files.
raw_dir_self_refine = "results/self-refine_few-shot"
cleaned_dir_self_refine = "results/self-refine_few-shot_clean"
process_files(raw_dir_self_refine, cleaned_dir_self_refine, "self-refine_few-shot_results", clean_self_refine)

Cleaned file already exists: self-refine_few-shot_results_clean_2025-04-01_04-17-46.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-31_18-12-53.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-27_15-08-17.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-30_19-59-42.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-30_17-20-05.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-04-01_09-24-29.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-28_12-33-28.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-31_23-15-22.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-30_21-17-41.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-30_14-31-20.csv
Cleaned file already exists: self-refine_few-shot_results_clean_2025-03-31_11-23-02.csv
Cleaned file already exists: sel