In [11]:
import math
import os

import numpy as np
from scipy.stats import wilcoxon

def get_objective_value(filepath):
    """
    Read the objective score from a single-line CSV result file.

    Only the first line of the file is read. It is split on commas and the
    second field (index 1) is converted to a float.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The objective score as a finite float, or None when the file cannot
        be read, is empty, has fewer than two columns, or its second column
        is not a finite number. A warning or error message is printed in
        every None case.
    """
    try:
        with open(filepath, 'r') as f:
            # Read the first (and only expected) line of the file
            line = f.readline()

        if not line:
            print(f"Warning: File {filepath} is empty.")
            return None

        # Split the line into parts by the comma
        parts = line.strip().split(',')

        # The objective value is the second column (index 1)
        if len(parts) < 2:
            print(f"Warning: File {filepath} has fewer than 2 columns.")
            return None

        value = float(parts[1])
    except (OSError, ValueError) as e:
        # OSError covers unreadable/missing files; ValueError covers a second
        # column that is not numeric (and text-decoding failures).
        print(f"Error reading file {filepath}: {e}")
        return None

    # float() happily parses "nan" and "inf"; such a score would propagate
    # through the percent-difference arithmetic and invalidate the test.
    if not math.isfinite(value):
        print(f"Warning: File {filepath} has a non-finite objective value ({parts[1].strip()}).")
        return None

    return value

def _paired_percent_differences(llm_dir, original_dir, run_files):
    """Return the percent difference (LLM vs. original) for every usable file pair."""
    differences = []
    for filename in run_files:
        llm_filepath = os.path.join(llm_dir, filename)
        original_filepath = os.path.join(original_dir, filename)

        if not os.path.exists(llm_filepath):
            print(f"Warning: Corresponding file for {filename} not found in {llm_dir}. Skipping.")
            continue

        llm_val = get_objective_value(llm_filepath)
        original_val = get_objective_value(original_filepath)
        if llm_val is None or original_val is None:
            # get_objective_value has already printed the reason.
            continue

        if original_val == 0:
            print(f"Warning: Original value is zero for {filename}. Cannot calculate percent difference. Skipping.")
            continue

        # NOTE(review): the sign of this ratio flips when the original value
        # is negative -- assumes objective values are positive; confirm.
        percent_diff = 100.0 * (llm_val - original_val) / original_val

        # A NaN/inf input would otherwise turn the whole test result into NaN.
        if not np.isfinite(percent_diff):
            print(f"Warning: Non-finite percent difference for {filename}. Skipping.")
            continue

        differences.append(percent_diff)
    return differences


def main(
    llm_dir=r'C:\Users\aidan\Desktop\Last 4 runs LLM algo\results',
    original_dir=r'C:\Users\aidan\Desktop\lima_vns_bmssc-main-original-copy-init_soln\results',
    delta=0.5,
    alpha=0.05,
):
    """
    Run a paired, one-sided Wilcoxon signed-rank non-inferiority test.

    Every ``.csv`` file in ``original_dir`` is paired with the file of the
    same name in ``llm_dir``. For each pair the percent difference
    ``100 * (llm - original) / original`` of the objective values is
    computed, and the differences shifted by ``delta`` are tested with
    ``alternative="less"``. The report is printed to stdout.

    Args:
        llm_dir: Directory holding the LLM algorithm's result files.
        original_dir: Directory holding the original algorithm's result
            files; its listing decides which filenames are paired.
        delta: Non-inferiority margin in percentage points.
        alpha: Significance level used for the printed decision.

    Returns:
        None. Problems (missing directory, no usable pairs, degenerate
        data) are reported with a printed message and an early return.
    """
    # Get the list of run files; the original directory is the source of
    # truth for filenames. Sorted so warnings appear in a stable order.
    try:
        run_files = sorted(f for f in os.listdir(original_dir) if f.endswith('.csv'))
    except FileNotFoundError:
        print(f"Error: Directory '{original_dir}' not found.")
        return
    except OSError as e:
        # e.g. the path is a regular file, or the directory is not readable.
        print(f"Error: Cannot list directory '{original_dir}': {e}")
        return

    if not run_files:
        print(f"Error: No .csv files found in the original directory: '{original_dir}'")
        return

    paired_differences_pct = _paired_percent_differences(llm_dir, original_dir, run_files)

    if not paired_differences_pct:
        print("Error: No valid data pairs were found to perform the test.")
        return

    # --- Perform the Statistical Test ---
    differences = np.array(paired_differences_pct)
    n = len(differences)
    shifted = differences - delta

    # The default zero_method discards zero differences; if every shifted
    # difference is zero there is nothing left to rank (depending on the
    # SciPy version this raises or yields NaN), so report it explicitly.
    if not np.any(shifted):
        print("Error: All paired differences equal the margin; the Wilcoxon test is undefined.")
        return

    # Wilcoxon signed-rank non-inferiority at margin δ (one-sided)
    # H0: median(d) >= δ   vs   H1: median(d) < δ
    stat, p_value = wilcoxon(shifted, alternative="less")

    if p_value < alpha:
        decision = "REJECT H0 (Conclusion: LLM is non-inferior)"
    else:
        decision = "FAIL to reject H0 (Conclusion: Cannot claim non-inferiority)"

    print("--- Wilcoxon Signed-Rank Non-Inferiority Test ---")
    print(f"Number of paired runs (n): {n}")
    print(f"Non-inferiority margin (δ): {delta:.1f}%")
    print("\nHypotheses:")
    print(f"  H0: median(difference) >= {delta:.1f}%")
    print(f"  H1: median(difference) < {delta:.1f}%")
    print("\nResults:")
    print(f"  Wilcoxon Statistic (W): {stat:.1f}")
    print(f"  p-value: {p_value:.6f}")
    print(f"\nDecision at α = {alpha}: {decision}")

# Run the analysis only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

--- Wilcoxon Signed-Rank Non-Inferiority Test ---
Number of paired runs (n): 158
Non-inferiority margin (δ): 0.5%

Hypotheses:
  H0: median(difference) >= 0.5%
  H1: median(difference) < 0.5%

Results:
  Wilcoxon Statistic (W): 5200.0
  p-value: 0.030140

Decision at α = 0.05: REJECT H0 (Conclusion: LLM is non-inferior)
