In [3]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# put it at the front of the import search path, unless it is already listed.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), "..", "..")
)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, zscore

In [5]:
# Older-adult cohort. Note that the trial-level frame is read from the
# *filtered* measurements file, whereas the student trial-level frame below
# is read from the plain measurements file.
older_adults_trial_level = pd.read_csv(
    "../../data/processed/older_adults/filtered_measurements.csv"
)
older_adults_average_level = pd.read_csv(
    "../../data/processed/older_adults/average_measurements.csv"
)

# Student cohort.
students_trial_level = pd.read_csv(
    "../../data/processed/students/measurements.csv"
)
students_average_level = pd.read_csv(
    "../../data/processed/students/average_measurements.csv"
)

In [2]:
def check_ccc_assumptions(df, average_level=False, save_results=False, *,
                          linearity_r_min=0.7, hetero_alpha=0.05,
                          hetero_r_min=0.3, outlier_z=3.0):
    """
    Check CCC assumptions: linearity, homoscedasticity, and outliers
    for each (state, metric) using Force_Plate and ZED_COM.

    For every (state, metric) group the Force_Plate and ZED_COM values are
    paired per participant and three screens are applied:

    - Linearity: |Pearson r| between the two devices must exceed
      ``linearity_r_min``.
    - Homoscedasticity: the absolute device difference is correlated with the
      device mean; heteroscedasticity is flagged when that correlation has
      p < ``hetero_alpha`` and |r| > ``hetero_r_min``.
    - Outliers: any device difference whose |z-score| exceeds ``outlier_z``.

    A table is printed to stdout; nothing is returned.

    Notes:
    - Rows are paired with ``pivot_table``, whose default aggregation is the
      mean. Several rows for the same participant and device (e.g. repeated
      trials) are therefore averaged into one value before any check runs.
    - Participants missing a value for either device are dropped. A group
      that has no data at all for one device is reported as "Too few".

    Parameters:
    - df: DataFrame with ['participant name', 'state', 'device', 'metric', 'value' or 'average_value']
    - average_level: bool, use 'average_value' if True, else 'value'
    - save_results: bool, if True, saves summary to assumptions/ccc_assumptions_*.csv
    - linearity_r_min: float, |r| threshold above which the relation is called linear
    - hetero_alpha: float, significance level for the heteroscedasticity check
    - hetero_r_min: float, minimum |r| for heteroscedasticity to be flagged
    - outlier_z: float, |z-score| threshold above which a difference is an outlier
    """
    devices = ['Force_Plate', 'ZED_COM']
    value_col = 'average_value' if average_level else 'value'
    df_filtered = df[df['device'].isin(devices)]

    results = []

    header = (f"{'State':<10} {'Metric':<25} {'n':<4} {'Linearity (r)':<18} "
              f"{'Homoscedasticity':<20} {'Outliers':<10} {'Summary'}")
    print(header)
    print("-" * len(header))

    for (state, metric), group in df_filtered.groupby(['state', 'metric']):
        # reindex guarantees both device columns exist; a device that is
        # absent from this group becomes all-NaN and dropna() then empties
        # the table, so the group is reported as "Too few" instead of
        # raising KeyError further down.
        pivot = (
            group.pivot_table(index='participant name', columns='device', values=value_col)
            .reindex(columns=devices)
            .dropna()
        )
        n = pivot.shape[0]

        if n < 3:
            print(f"{state:<10} {metric:<25} {n:<4} {'Too few':<18} {'Too few':<20} {'Too few':<10} Too few samples")
            results.append({
                'state': state, 'metric': metric, 'n': n,
                'linearity_r': 'Too few', 'homoscedasticity': 'Too few', 'outliers': 'Too few',
                'summary': 'Too few samples'
            })
            continue

        x = pivot['Force_Plate'].to_numpy(dtype=float)
        y = pivot['ZED_COM'].to_numpy(dtype=float)
        diff = x - y

        # Linearity
        r_val, _ = pearsonr(x, y)
        linearity_status = 'Linear' if abs(r_val) > linearity_r_min else 'Non-linear'

        # Homoscedasticity: does the size of the disagreement grow (or
        # shrink) with the magnitude of the measurement?
        mean_vals = (x + y) / 2
        r_hetero, p_hetero = pearsonr(mean_vals, np.abs(diff))
        is_hetero = p_hetero < hetero_alpha and abs(r_hetero) > hetero_r_min
        hetero_status = 'Yes' if is_hetero else 'No'

        # Outlier detection using z-scores of differences.
        # zscore() uses the population std (ddof=0), for which the largest
        # attainable |z| is (n - 1) / sqrt(n); with the default threshold of
        # 3 no outlier can be flagged unless n >= 11.
        z_scores = zscore(diff)
        num_outliers = int(np.sum(np.abs(z_scores) > outlier_z))
        has_outliers = num_outliers > 0
        outlier_status = 'Yes' if has_outliers else 'No'

        # Summary: list only the labels that apply, so no empty segments
        # ("Linear, , Outliers") or trailing separators are produced.
        if linearity_status == 'Linear' and not is_hetero and not has_outliers:
            summary = 'Assumptions met'
        else:
            labels = [linearity_status]
            if is_hetero:
                labels.append('Heterosced')
            if has_outliers:
                labels.append('Outliers')
            summary = ', '.join(labels)

        # Pad the combined "r (status)" cell to the header's column width so
        # rows stay aligned whether the status is "Linear" or "Non-linear".
        linearity_cell = f"{r_val:.3f} ({linearity_status})"
        print(f"{state:<10} {metric:<25} {n:<4} {linearity_cell:<18} {hetero_status:<20} {outlier_status:<10} {summary}")

        results.append({
            'state': state,
            'metric': metric,
            'n': n,
            'linearity_r': r_val,
            'homoscedasticity': hetero_status,
            'outliers': outlier_status,
            'summary': summary
        })

    # Save results if needed
    if save_results:
        os.makedirs("assumptions", exist_ok=True)
        filename = f"ccc_assumptions_{'avg' if average_level else 'trial'}.csv"
        pd.DataFrame(results).to_csv(os.path.join("assumptions", filename), index=False)
        print(f"\nSaved CCC assumptions summary to: assumptions/{filename}")


In [6]:
# Screen the older adults' trial-level frame; with average_level=False the
# function reads the 'value' column.
check_ccc_assumptions(df=older_adults_trial_level, average_level=False, save_results=False)

State      Metric                    n    Linearity (r)      Homoscedasticity     Outliers   Summary
------------------------------------------------------------------------------------------
closed     AP MAD                    37   0.803 (Linear)   No                   Yes        Linear, , Outliers
closed     AP Max abs dev            37   0.676 (Non-linear)   Yes                  Yes        Non-linear, Heterosced, Outliers
closed     AP Range                  37   0.683 (Non-linear)   Yes                  No         Non-linear, Heterosced, 
closed     AP Variance               37   0.836 (Linear)   Yes                  Yes        Linear, Heterosced, Outliers
closed     Ellipse area              37   0.907 (Linear)   Yes                  Yes        Linear, Heterosced, Outliers
closed     ML MAD                    37   0.917 (Linear)   No                   No         Assumptions met
closed     ML Max abs dev            37   0.722 (Linear)   Yes                  Yes        Linear, Hete

In [7]:
# Screen the older adults' per-participant averages; average_level=True makes
# the function read the 'average_value' column.
check_ccc_assumptions(df=older_adults_average_level, average_level=True, save_results=False)

State      Metric                    n    Linearity (r)      Homoscedasticity     Outliers   Summary
------------------------------------------------------------------------------------------
closed     AP MAD                    37   0.803 (Linear)   No                   Yes        Linear, , Outliers
closed     AP Max abs dev            37   0.676 (Non-linear)   Yes                  Yes        Non-linear, Heterosced, Outliers
closed     AP Range                  37   0.683 (Non-linear)   Yes                  No         Non-linear, Heterosced, 
closed     AP Variance               37   0.836 (Linear)   Yes                  Yes        Linear, Heterosced, Outliers
closed     Ellipse area              37   0.907 (Linear)   Yes                  Yes        Linear, Heterosced, Outliers
closed     ML MAD                    37   0.917 (Linear)   No                   No         Assumptions met
closed     ML Max abs dev            37   0.722 (Linear)   Yes                  Yes        Linear, Hete