# Statistical Significance Testing (PRE-Decision)

This notebook performs rigorous statistical tests to validate that our model performance is:
1. **Significantly better than chance** (permutation test)
2. **Reliable** (bootstrap confidence intervals)
3. **Significantly different between models** (McNemar's test)

In [6]:
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set_style('whitegrid')
# Seed NumPy's global RNG: the permutation shuffles (np.random.permutation) and the
# bootstrap subject draws (np.random.choice) below both read from this global state.
np.random.seed(42)

## Load PRE-Decision Results

In [7]:
# Read the pickled feature-extraction output and keep the merged per-trial table.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts that were produced by this project.
features_path = Path('../../data/results/features_PRE/extracted_features_PRE.pkl')
with features_path.open('rb') as fh:
    feature_data = pickle.load(fh)

data = feature_data['merged_df']

# Sanity summary: number of trials, number of subjects and class balance.
n_trials_loaded = len(data)
n_subjects_loaded = data['subject_id'].nunique()
outcome_counts = data['outcome'].value_counts().to_dict()

print(f"Total trials: {n_trials_loaded}")
print(f"Subjects: {n_subjects_loaded}")
print(f"Outcome distribution: {outcome_counts}")

Total trials: 12511
Subjects: 97
Outcome distribution: {1: 8238, 0: 4273}


## 1. Permutation Test: Better Than Chance?

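**Null hypothesis:** The features carry no information about the outcome, so the observed accuracy is no better than the accuracy obtained with shuffled labels. Because the classes are imbalanced (about 66% of trials have outcome 1), this chance level is estimated empirically from the null distribution rather than assumed to be 50%.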

**Method:** Shuffle labels 1000 times, recompute accuracy, compare to observed

In [8]:
def _loso_mean_accuracy(X, y, subjects, rf_seed, n_jobs=None):
    """
    Mean per-fold accuracy of the random forest under leave-one-subject-out CV.

    Each fold trains a fresh forest (fixed hyper-parameters, seeded with
    ``rf_seed``) on every subject but one and scores it on the held-out subject.
    """
    fold_accs = []
    for train_idx, test_idx in LeaveOneGroupOut().split(X, y, subjects):
        model = RandomForestClassifier(n_estimators=100, max_depth=5,
                                       min_samples_split=10, min_samples_leaf=5,
                                       random_state=rf_seed, n_jobs=n_jobs)
        model.fit(X[train_idx], y[train_idx])
        fold_accs.append(accuracy_score(y[test_idx], model.predict(X[test_idx])))
    return np.mean(fold_accs)


def permutation_test(X, y, subjects, n_permutations=1000, model_name="Model", n_jobs=-1):
    """
    Permutation test to assess if accuracy is significantly > chance.

    The observed score is the mean leave-one-subject-out (LOSO) accuracy of a
    random forest. The null distribution is built by shuffling the labels
    across all trials with ``np.random.permutation`` (global NumPy RNG) and
    repeating the full LOSO evaluation for every shuffle.

    Parameters
    ----------
    X : array-like, shape (n_trials, n_features)
        Feature matrix (must not contain NaNs).
    y : array-like, shape (n_trials,)
        Class labels.
    subjects : array-like, shape (n_trials,)
        Subject id of every trial; used as the CV group.
    n_permutations : int
        Number of label shuffles.
    model_name : str
        Label used in the printed report.
    n_jobs : int or None
        Forwarded to ``RandomForestClassifier`` to fit trees in parallel.
        Every permutation repeats the whole LOSO loop, so runtime is dominated
        by forest fitting.

    Returns
    -------
    dict
        ``true_accuracy``, ``null_mean``, ``null_std``, ``p_value`` and
        ``null_distribution`` (list with one mean accuracy per permutation).

    Notes
    -----
    The p-value is ``(#{null >= observed} + 1) / (n_permutations + 1)``. The
    +1 counts the observed labelling as one of the permutations, so the
    estimate can never be exactly 0 (its smallest value is
    ``1 / (n_permutations + 1)``).
    """
    # Accept lists / pandas objects: positional fancy indexing needs ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    subjects = np.asarray(subjects)

    print(f"\n{'='*70}")
    print(f"Permutation Test: {model_name}")
    print(f"{'='*70}")

    # Observed score on the real labels (forest seed fixed at 42).
    true_accuracy = _loso_mean_accuracy(X, y, subjects, rf_seed=42, n_jobs=n_jobs)
    print(f"True Accuracy: {true_accuracy:.4f}")

    # Null distribution: same pipeline, labels shuffled across all trials.
    null_accs = []
    for i in tqdm(range(n_permutations), desc="Permutations"):
        y_shuffled = np.random.permutation(y)
        # Forest seed varies per permutation so the null also reflects model randomness.
        null_accs.append(_loso_mean_accuracy(X, y_shuffled, subjects, rf_seed=i, n_jobs=n_jobs))

    # Bias-corrected permutation p-value (never exactly zero).
    n_as_extreme = int(np.sum(np.asarray(null_accs) >= true_accuracy))
    p_value = (n_as_extreme + 1) / (n_permutations + 1)

    print(f"\nNull Distribution Mean: {np.mean(null_accs):.4f}")
    print(f"Null Distribution Std: {np.std(null_accs):.4f}")
    print(f"\np-value: {p_value:.4f}")

    if p_value < 0.001:
        print(f"✓ {model_name} accuracy is HIGHLY SIGNIFICANT (p < 0.001)")
    elif p_value < 0.01:
        print(f"✓ {model_name} accuracy is VERY SIGNIFICANT (p < 0.01)")
    elif p_value < 0.05:
        print(f"✓ {model_name} accuracy is SIGNIFICANT (p < 0.05)")
    else:
        print(f"✗ {model_name} accuracy is NOT SIGNIFICANT (p = {p_value:.4f})")

    return {
        'true_accuracy': true_accuracy,
        'null_mean': np.mean(null_accs),
        'null_std': np.std(null_accs),
        'p_value': p_value,
        'null_distribution': null_accs
    }

In [None]:
# Define feature groups (from extraction notebook)
gaze_features = [
    'gaze_valid_pct', 'gaze_x_mean', 'gaze_x_std', 'gaze_y_mean', 'gaze_y_std',
    'screen_x_mean', 'screen_x_std', 'screen_y_mean', 'screen_y_std',
    'gaze_velocity_mean', 'gaze_velocity_std', 'gaze_velocity_max',
    'gaze_acceleration_mean', 'gaze_acceleration_std',
    'fixation_ratio', 'saccade_ratio', 'saccade_count',
    'gaze_dispersion_x', 'gaze_dispersion_y', 'gaze_path_length'
]

# Keep only the gaze features present in the loaded table, and report the ones
# that are absent so a renamed or missing column cannot silently shrink the model.
available_gaze = [f for f in gaze_features if f in data.columns]
missing_gaze = [f for f in gaze_features if f not in data.columns]
if missing_gaze:
    print(f"Warning: {len(missing_gaze)} gaze feature(s) not found in data and skipped: {missing_gaze}")
if not available_gaze:
    raise ValueError("None of the expected gaze features are present in `data`.")

# Prepare data
# NOTE(review): the imputer is fit on all trials at once, so every held-out
# subject contributes to the imputation means used for training. Consider
# imputing inside the CV loop to remove this (small) leakage.
X_gaze = SimpleImputer(strategy='mean').fit_transform(data[available_gaze])
y = data['outcome'].values
subjects = data['subject_id'].values

# Run permutation test on gaze model (best performing)
# Each permutation repeats the full leave-one-subject-out CV; the recorded run
# below shows roughly 139 s per permutation, so 1000 permutations take many hours.
perm_results = permutation_test(X_gaze, y, subjects,
                               n_permutations=1000,
                               model_name="Gaze Model (PRE)")


Permutation Test: Gaze Model (PRE)
True Accuracy: 0.6566


Permutations:   0%|          | 3/1000 [06:49<38:31:57, 139.13s/it]

In [None]:
# Histogram of the permutation null distribution with the observed accuracy
# (solid red) and the null mean (dashed blue) marked as vertical lines.
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(
    perm_results['null_distribution'],
    bins=50,
    alpha=0.7,
    color='gray',
    edgecolor='black',
    label='Null Distribution',
)
ax.axvline(
    perm_results['true_accuracy'],
    color='red',
    linewidth=3,
    label=f"True Accuracy = {perm_results['true_accuracy']:.4f}",
)
ax.axvline(
    perm_results['null_mean'],
    color='blue',
    linewidth=2,
    linestyle='--',
    label=f"Null Mean = {perm_results['null_mean']:.4f}",
)

ax.set_xlabel('Accuracy')
ax.set_ylabel('Frequency')
ax.set_title(f"Permutation Test: Gaze Model (PRE)\np-value = {perm_results['p_value']:.4f}")
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 2. Bootstrap Confidence Intervals

**Goal:** Estimate 95% CI for accuracy and F1-score

**Method:** Resample subjects with replacement 1000 times

In [None]:
def _loso_weighted_scores(X, y, groups, draw_counts, rf_seed, n_jobs=None):
    """
    Leave-one-subject-out CV returning draw-weighted mean accuracy and F1.

    ``draw_counts`` maps each subject id to how many times it was drawn in the
    current bootstrap sample; each fold's score is weighted by that count so a
    subject drawn k times counts k times in the average.
    """
    fold_accs, fold_f1s, fold_weights = [], [], []
    for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups):
        model = RandomForestClassifier(n_estimators=100, max_depth=5,
                                       min_samples_split=10, min_samples_leaf=5,
                                       random_state=rf_seed, n_jobs=n_jobs)
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])

        fold_accs.append(accuracy_score(y[test_idx], y_pred))
        fold_f1s.append(f1_score(y[test_idx], y_pred, average='weighted'))
        # Every row of a test fold belongs to the same subject.
        fold_weights.append(draw_counts[groups[test_idx[0]]])

    return (np.average(fold_accs, weights=fold_weights),
            np.average(fold_f1s, weights=fold_weights))


def bootstrap_ci(X, y, subjects, n_bootstrap=1000, model_name="Model", n_jobs=-1):
    """
    Bootstrap confidence intervals for accuracy and F1-score.

    Subjects are resampled with replacement (``np.random.choice``, global
    NumPy RNG). A subject drawn k times contributes its trials k times to the
    bootstrap sample and its held-out score is weighted k times. Selecting
    rows with a membership mask would instead collapse repeated draws and
    evaluate only a subset of subjects. Copies keep the original subject id as
    their CV group, so duplicates of one subject are never split between the
    training and the test side of a fold.

    Parameters
    ----------
    X : array-like, shape (n_trials, n_features)
        Feature matrix (must not contain NaNs).
    y : array-like, shape (n_trials,)
        Class labels.
    subjects : array-like, shape (n_trials,)
        Subject id of every trial.
    n_bootstrap : int
        Number of bootstrap resamples; each one runs a full LOSO CV.
    model_name : str
        Label used in the printed report.
    n_jobs : int or None
        Forwarded to ``RandomForestClassifier`` to fit trees in parallel.

    Returns
    -------
    dict
        ``accuracy_mean``, ``accuracy_ci``, ``f1_mean``, ``f1_ci`` (percentile
        95% intervals) plus the raw ``boot_accs`` / ``boot_f1s`` lists.
    """
    # Accept lists / pandas objects: positional fancy indexing needs ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    subjects = np.asarray(subjects)

    print(f"\n{'='*70}")
    print(f"Bootstrap Confidence Intervals: {model_name}")
    print(f"{'='*70}")

    unique_subjects = np.unique(subjects)
    n_subjects = len(unique_subjects)
    # Row positions of every subject, computed once and reused by all resamples.
    rows_by_subject = {s: np.flatnonzero(subjects == s) for s in unique_subjects}

    boot_accs = []
    boot_f1s = []

    for i in tqdm(range(n_bootstrap), desc="Bootstrap iterations"):
        # Resample subjects with replacement
        boot_subjects = np.random.choice(unique_subjects, size=n_subjects, replace=True)
        drawn, counts = np.unique(boot_subjects, return_counts=True)
        draw_counts = dict(zip(drawn, counts))

        # Replicate each drawn subject's rows once per draw (honours multiplicity).
        boot_idx = np.concatenate([np.tile(rows_by_subject[s], c)
                                   for s, c in zip(drawn, counts)])
        X_boot = X[boot_idx]
        y_boot = y[boot_idx]
        subjects_boot = subjects[boot_idx]

        # LOSO CV on bootstrap sample
        acc, f1 = _loso_weighted_scores(X_boot, y_boot, subjects_boot,
                                        draw_counts, rf_seed=i, n_jobs=n_jobs)
        boot_accs.append(acc)
        boot_f1s.append(f1)

    # Compute 95% CI
    acc_ci = np.percentile(boot_accs, [2.5, 97.5])
    f1_ci = np.percentile(boot_f1s, [2.5, 97.5])

    print(f"\nAccuracy: {np.mean(boot_accs):.4f} (95% CI: {acc_ci[0]:.4f}-{acc_ci[1]:.4f})")
    print(f"F1-Score: {np.mean(boot_f1s):.4f} (95% CI: {f1_ci[0]:.4f}-{f1_ci[1]:.4f})")

    return {
        'accuracy_mean': np.mean(boot_accs),
        'accuracy_ci': acc_ci,
        'f1_mean': np.mean(boot_f1s),
        'f1_ci': f1_ci,
        'boot_accs': boot_accs,
        'boot_f1s': boot_f1s
    }

In [None]:
# Run bootstrap for gaze model
# Each of the 1000 bootstrap iterations runs a full leave-one-subject-out CV
# inside bootstrap_ci, so expect this cell to be very slow.
boot_results = bootstrap_ci(X_gaze, y, subjects, 
                           n_bootstrap=1000, 
                           model_name="Gaze Model (PRE)")

In [None]:
# Side-by-side histograms of the bootstrap accuracy and F1 distributions, each
# with its mean (solid red) and 95% percentile interval (dashed orange) marked.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: (samples key, mean key, CI key, bar colour, x label, title).
panel_specs = [
    ('boot_accs', 'accuracy_mean', 'accuracy_ci', 'steelblue',
     'Accuracy', 'Bootstrap Distribution: Accuracy'),
    ('boot_f1s', 'f1_mean', 'f1_ci', 'coral',
     'F1-Score', 'Bootstrap Distribution: F1-Score'),
]

for panel, (samples_key, mean_key, ci_key, bar_color, x_label, panel_title) in zip(axes, panel_specs):
    panel.hist(boot_results[samples_key], bins=30, alpha=0.7,
               color=bar_color, edgecolor='black')
    panel.axvline(boot_results[mean_key], color='red', linewidth=2,
                  label=f"Mean = {boot_results[mean_key]:.4f}")
    # Only the lower bound carries the legend entry so "95% CI" is listed once.
    panel.axvline(boot_results[ci_key][0], color='orange',
                  linewidth=2, linestyle='--', label=f"95% CI")
    panel.axvline(boot_results[ci_key][1], color='orange',
                  linewidth=2, linestyle='--')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. McNemar's Test: Compare Models

**Goal:** Test if differences between models are statistically significant

**Comparisons:**
1. Weighted Fusion vs Behavior Only
2. Weighted Fusion vs Gaze Only  
3. Weighted Fusion vs Average Fusion

In [None]:
from statsmodels.stats.contingency_tables import mcnemar

def mcnemar_test(y_true, pred_model1, pred_model2, model1_name, model2_name, exact=None):
    """
    McNemar's test to compare two models evaluated on the same samples.

    Parameters
    ----------
    y_true, pred_model1, pred_model2 : array-like, same shape
        True labels and the two models' predictions for the same samples.
        Lists are accepted; everything is converted to NumPy arrays so the
        comparisons below are element-wise.
    model1_name, model2_name : str
        Labels used in the printed report.
    exact : bool or None
        ``True`` uses the exact binomial test, ``False`` the chi-square
        approximation with continuity correction. ``None`` (default) picks the
        exact test when fewer than 25 pairs are discordant, where the
        chi-square approximation is unreliable (and undefined when there are
        no discordant pairs).

    Returns
    -------
    dict
        ``statistic``, ``pvalue``, ``contingency_table`` (2x2 array laid out as
        [[both_correct, only_model1_correct],
         [only_model2_correct, both_wrong]]) and ``exact`` (the method used).

    Raises
    ------
    ValueError
        If the three inputs do not have the same shape.

    Notes
    -----
    NOTE(review): McNemar's test assumes independent samples; if the inputs
    are trials nested within subjects this assumption should be confirmed.
    """
    y_true = np.asarray(y_true)
    pred_model1 = np.asarray(pred_model1)
    pred_model2 = np.asarray(pred_model2)
    if not (y_true.shape == pred_model1.shape == pred_model2.shape):
        raise ValueError(
            "y_true, pred_model1 and pred_model2 must have the same shape, got "
            f"{y_true.shape}, {pred_model1.shape} and {pred_model2.shape}"
        )

    # Per-sample correctness of each model.
    model1_ok = pred_model1 == y_true
    model2_ok = pred_model2 == y_true

    # Create contingency table
    # Format: [[both_correct, model1_correct_model2_wrong],
    #          [model1_wrong_model2_correct, both_wrong]]
    both_correct = np.sum(model1_ok & model2_ok)
    model1_correct = np.sum(model1_ok & ~model2_ok)
    model2_correct = np.sum(~model1_ok & model2_ok)
    both_wrong = np.sum(~model1_ok & ~model2_ok)

    contingency_table = np.array([[both_correct, model1_correct],
                                  [model2_correct, both_wrong]])

    # Only the discordant pairs inform the test; with few of them use the exact form.
    if exact is None:
        exact = (model1_correct + model2_correct) < 25
    exact = bool(exact)

    # Run McNemar's test (`correction` only affects the chi-square form).
    result = mcnemar(contingency_table, exact=exact, correction=True)

    print(f"\n{'='*70}")
    print(f"McNemar's Test: {model1_name} vs {model2_name}")
    print(f"{'='*70}")
    print(f"\nContingency Table:")
    print(f"  Both correct: {both_correct}")
    print(f"  Only {model1_name} correct: {model1_correct}")
    print(f"  Only {model2_name} correct: {model2_correct}")
    print(f"  Both wrong: {both_wrong}")
    print(f"\nMethod: {'exact binomial' if exact else 'chi-square with continuity correction'}")
    print(f"Test statistic: {result.statistic:.4f}")
    print(f"p-value: {result.pvalue:.4f}")

    if result.pvalue < 0.001:
        print(f"✓ Difference is HIGHLY SIGNIFICANT (p < 0.001)")
    elif result.pvalue < 0.01:
        print(f"✓ Difference is VERY SIGNIFICANT (p < 0.01)")
    elif result.pvalue < 0.05:
        print(f"✓ Difference is SIGNIFICANT (p < 0.05)")
    else:
        print(f"✗ Difference is NOT SIGNIFICANT (p = {result.pvalue:.4f})")

    return {
        'statistic': result.statistic,
        'pvalue': result.pvalue,
        'contingency_table': contingency_table,
        'exact': exact
    }

In [None]:
# Get predictions from different models
# (You'll need to run the late fusion model first to get these)
# TODO: this cell is a template only; nothing below the prints is executed until
# the commented-out lines are enabled with real per-trial predictions.

# Example - you'd load actual predictions from your fusion results
print("Note: Load actual model predictions from late_fusion_model_PRE.ipynb results")
print("This is a template showing the methodology.")

# Placeholder for demonstration
# All prediction arrays must be aligned with y_true sample-for-sample, because
# mcnemar_test compares the two models' correctness on each sample.
# y_true = fusion_results['y_true']
# pred_weighted = fusion_results['weighted']['predictions']
# pred_behavior = fusion_results['behavior']['predictions']
# pred_gaze = fusion_results['gaze']['predictions']
# pred_average = fusion_results['average']['predictions']

# mcnemar_1 = mcnemar_test(y_true, pred_weighted, pred_behavior, 
#                         "Weighted Fusion", "Behavior Only")
# mcnemar_2 = mcnemar_test(y_true, pred_weighted, pred_gaze,
#                         "Weighted Fusion", "Gaze Only")
# mcnemar_3 = mcnemar_test(y_true, pred_weighted, pred_average,
#                         "Weighted Fusion", "Average Fusion")

## Summary Table: All Statistical Tests

In [None]:
# Create summary table: one row per statistical test, built from the result
# dictionaries returned by permutation_test and bootstrap_ci.
summary_df = pd.DataFrame({
    'Test': [
        'Permutation Test',
        'Bootstrap CI (Accuracy)',
        'Bootstrap CI (F1-Score)'
    ],
    'Result': [
        f"{perm_results['true_accuracy']:.4f} (p = {perm_results['p_value']:.4f})",
        f"{boot_results['accuracy_mean']:.4f} ({boot_results['accuracy_ci'][0]:.4f}-{boot_results['accuracy_ci'][1]:.4f})",
        f"{boot_results['f1_mean']:.4f} ({boot_results['f1_ci'][0]:.4f}-{boot_results['f1_ci'][1]:.4f})"
    ],
    'Interpretation': [
        'Significantly > chance' if perm_results['p_value'] < 0.05 else 'Not significant',
        '95% confidence interval',
        '95% confidence interval'
    ]
})

print("\n" + "="*70)
print("STATISTICAL TESTING SUMMARY (PRE-DECISION)")
print("="*70)
print(summary_df.to_string(index=False))

# Save summary
# Create the output directory first so to_csv does not fail on a fresh checkout
# where analysis_outputs_PRE has not been created yet.
summary_path = Path('../../data/results/analysis_outputs_PRE/statistical_testing_summary_PRE.csv')
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(summary_path, index=False)
print("\n✓ Saved summary to: statistical_testing_summary_PRE.csv")

## Save All Results

In [None]:
# Save detailed results: the raw permutation and bootstrap outputs plus the
# summary table, bundled into one pickle.
stat_results = {
    'permutation_test': perm_results,
    'bootstrap_ci': boot_results,
    'summary': summary_df
}

# Create the output directory first so the dump does not fail when
# analysis_outputs_PRE does not exist yet.
results_path = Path('../../data/results/analysis_outputs_PRE/statistical_testing_results_PRE.pkl')
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open('wb') as f:
    pickle.dump(stat_results, f)

print("✓ Saved all statistical test results")