In [77]:
import pandas as pd
import numpy as np
from scipy import stats
import os 

def perform_t_test_and_save(model_name, sae_path, output_csv_path, max_gt_score=0.9):
    """Run one-sided t-tests of the UK and US scores against the ground-truth
    score and append the results to a CSV file.

    Parameters
    ----------
    model_name : str
        Label written to the "Model" column of the output rows.
    sae_path : str or path-like
        Semicolon-separated CSV containing the columns ``us_score``,
        ``uk_score`` and ``ukGT_usGT_score``.
    output_csv_path : str or path-like
        CSV file the two result rows are appended to. A header row is written
        only when the file does not exist yet or is empty.
    max_gt_score : float, optional
        Only rows whose ``ukGT_usGT_score`` is strictly below this value are
        used. Defaults to 0.9.

    Returns
    -------
    None
        The results are written to ``output_csv_path`` and a confirmation
        line is printed.

    Raises
    ------
    KeyError
        If one of the three score columns is missing from the input file.

    Notes
    -----
    Rows are always appended, so calling this function twice with the same
    arguments duplicates the rows in the output file.
    """
    score_columns = ['us_score', 'ukGT_usGT_score', 'uk_score']

    # Load the semicolon-separated results file
    raw_scores = pd.read_csv(sae_path, sep=";")

    # Coerce every score column to numeric; unparseable entries become NaN.
    # Working on the three score columns only means the dropna() below cannot
    # discard a row because of a NaN in an unrelated column.
    numeric_scores = raw_scores[score_columns].apply(pd.to_numeric, errors='coerce')

    # Keep rows below the ground-truth cutoff (NaN compares False and is dropped),
    # then remove rows where any of the remaining scores is missing.
    below_cutoff = numeric_scores['ukGT_usGT_score'] < max_gt_score
    valid_scores = numeric_scores[below_cutoff].dropna()

    # Convert the score columns to numpy arrays
    sample_us = valid_scores['us_score'].to_numpy()
    sample_gt = valid_scores['ukGT_usGT_score'].to_numpy()
    sample_uk = valid_scores['uk_score'].to_numpy()

    # One-sided independent two-sample t-tests: is the mean UK / US score
    # greater than the mean ground-truth score?
    # NOTE(review): ttest_ind treats the samples as independent; if the scores
    # are paired per row, confirm that an unpaired test is intended.
    result_gt_uk = stats.ttest_ind(sample_uk, sample_gt, alternative='greater')
    result_gt_us = stats.ttest_ind(sample_us, sample_gt, alternative='greater')

    # One output row per comparison
    results = pd.DataFrame(
        [
            [model_name, "GroundTruth_UK", result_gt_uk.statistic, result_gt_uk.pvalue],
            [model_name, "GroundTruth_US", result_gt_us.statistic, result_gt_us.pvalue],
        ],
        columns=["Model", "Metric", "Statistic", "P-Value"],
    )

    # Write the header only for a new (missing or empty) file; otherwise append rows only
    needs_header = (
        not os.path.isfile(output_csv_path)
        or os.path.getsize(output_csv_path) == 0
    )
    results.to_csv(output_csv_path, mode='a', index=False, header=needs_header)

    print(f"Results saved to {output_csv_path}")


In [82]:
# Directory holding the per-model result files for case 1.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own checkout before running.
RESULTS_DIR = "/Users/nils/AutoPersonalizationBench/case1_CulturalSensitivity/results"

# All models write into the same summary file.
T_TEST_OUTPUT_CSV = "t_test_results_case1.csv"

# (model label, result file name), in the order the rows are appended.
MODEL_RESULT_FILES = [
    ("Llama13B", "results_llama13B.csv"),
    ("Llama7B", "results_llama7B.csv"),
    ("Vicuna13B", "results_vicuna13B.csv"),
    ("Vicuna7B", "results_vicuna7B.csv"),
    ("Alpaca", "results_alpaca.csv"),
    ("GPT-4", "gpt_results.csv"),
    ("Gemini", "gemini_results.csv"),
    ("Llama-70B", "llama_results.csv"),
]

# perform_t_test_and_save appends to the output file, so re-running this cell
# adds the same rows again; delete T_TEST_OUTPUT_CSV first for a clean run.
for model_name, result_file in MODEL_RESULT_FILES:
    perform_t_test_and_save(
        model_name,
        f"{RESULTS_DIR}/{result_file}",
        T_TEST_OUTPUT_CSV,
    )

Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv
Results saved to t_test_results_case1.csv


In [69]:
# Load the Alpaca results and coerce the ground-truth score to numeric
# (entries that cannot be parsed become NaN).
df = pd.read_csv(
    '/Users/nils/AutoPersonalizationBench/case1_CulturalSensitivity/results/results_alpaca.csv',
    sep=";",
).assign(
    ukGT_usGT_score=lambda frame: pd.to_numeric(frame['ukGT_usGT_score'], errors='coerce')
)
# Keep only rows whose ground-truth score is below 0.9
df = df.loc[df['ukGT_usGT_score'] < 0.9]

# Pull the three score columns out as numpy arrays
sample_us, sample_gt, sample_uk = (
    df[column].to_numpy()
    for column in ('us_score', 'ukGT_usGT_score', 'uk_score')
)

# Independent two-sample t-tests of the UK and US scores against ground truth
result_gt_uk = stats.ttest_ind(sample_uk, sample_gt)
result_gt_us = stats.ttest_ind(sample_us, sample_gt)

In [75]:
# For each sample (US, ground truth, UK), report whether it contains any NaN
print(*(np.isnan(sample).any() for sample in (sample_us, sample_gt, sample_uk)))


True False True
False False False


In [76]:
# Boolean mask: rows where at least one of the three score columns is missing
nan_rows_specific = (
    df.loc[:, ['us_score', 'ukGT_usGT_score', 'uk_score']]
    .isna()
    .any(axis='columns')
)
# Plain-text dump of the rows flagged by the mask
print(df.loc[nan_rows_specific])

     Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0  \
3               3             3           3   
182           182           182         182   
656           656           656         656   
764           764           764         764   

                                              question  \
3    Do you think that using military force against...   
182  Overall, are you satisfied or dissatisfied wit...   
656  Please tell me for each of the following state...   
764  Please tell me for each of the following thing...   

                                            selections  \
3    {'United States': [0.22916666666666666, 0.4583...   
182  {'United States': [0.32323232323232326, 0.6464...   
656  {'United States': [0.424, 0.129, 0.098, 0.055,...   
764  {'United States': [0.0, 0.022000000000000002, ...   

                                               options  \
3    ['Often be justified', 'Sometimes be justified...   
182        ['Satisfied', 'Dissatisfied', 'DK/Refused']   
656  ['N