In [35]:
import pandas as pd
import numpy as np
from scipy import stats
import os 

def _load_score_column(path):
    """Read a headerless tab-separated file and return its second column as a NumPy array."""
    # "\t" is spelled as an escape so the separator stays visible and survives
    # editors that convert literal tabs to spaces.
    return pd.read_csv(path, sep="\t", header=None)[1].to_numpy()


def perform_ks_tests_and_save(model_name, sae_path, esl_path, aae_path, output_csv_path):
    """Run two-sample KS tests (SAE vs. ESL, SAE vs. AAE) and append the results to a CSV.

    Each input file is read as tab-separated text without a header row; the
    values in its second column (index 1) form the sample used in the tests.

    Parameters
    ----------
    model_name : str
        Label written to the "Model" column of every output row.
    sae_path, esl_path, aae_path : str
        Paths to the tab-separated result files for the SAE, ESL and AAE groups.
    output_csv_path : str
        CSV file to append to. It is created if missing. The header row is
        written only when the file does not exist yet or is empty.

    Returns
    -------
    None
        Two rows ("SAE-ESL" and "SAE-AAE") are appended to ``output_csv_path``
        and a confirmation message is printed.

    Notes
    -----
    Calling this function again with the same arguments appends the same rows
    again; it does not de-duplicate existing content.
    """
    sample_sae = _load_score_column(sae_path)
    sample_esl = _load_score_column(esl_path)
    sample_aae = _load_score_column(aae_path)

    # SAE is the reference sample in both comparisons.
    comparisons = [
        ("SAE-ESL", stats.ks_2samp(sample_sae, sample_esl)),
        ("SAE-AAE", stats.ks_2samp(sample_sae, sample_aae)),
    ]

    rows = [
        [model_name, "AES_Score", pair, result.statistic, result.pvalue]
        for pair, result in comparisons
    ]
    df = pd.DataFrame(rows, columns=["Model", "Metric", "Pair", "Statistic", "P-Value"])

    # A header is needed when the file is missing OR exists but is empty
    # (e.g. pre-created/truncated); otherwise the first data row would later
    # be misread as the column names.
    needs_header = (
        not os.path.isfile(output_csv_path)
        or os.path.getsize(output_csv_path) == 0
    )

    df.to_csv(output_csv_path, mode='a', index=False, header=needs_header)

    print(f"Results saved to {output_csv_path}")


In [44]:
# KS comparison for the "Alpaca" model; the two result rows are appended to
# ks_test_results.csv in the current working directory.
perform_ks_tests_and_save(
    model_name="Alpaca",
    sae_path="case2_EducationalDisparity/results/MultiScaleBertAESResults/alpaca_base_results.txt",
    esl_path="case2_EducationalDisparity/results/MultiScaleBertAESResults/alpaca_ESL_results.txt",
    aae_path="case2_EducationalDisparity/results/MultiScaleBertAESResults/alpaca_AAE_results.txt",
    output_csv_path="ks_test_results.csv",
)


Results saved to ks_test_results.csv
