In [1]:
import sys
import os

# Resolve the parent of the current working directory and make sure it is on
# the import path (presumably so project-local modules resolve from this
# notebook — confirm against the repository layout).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), "..")
)
if sys.path.count(project_root) == 0:
    # Prepend so this directory is searched before the other sys.path entries.
    sys.path.insert(0, project_root)

In [2]:
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu, chi2_contingency
from statsmodels.stats.diagnostic import lilliefors

In [3]:
# Read the additional participant information CSV and preview its first two rows.
additional_info = pd.read_csv(
    filepath_or_buffer="../data/raw/additional_participants_information.csv"
)
additional_info.head(n=2)

Unnamed: 0,participant name,gender,age,height,weight,foot length,foot width,faller,low stability
0,dida,female,79,156.0,67.0,28.0,11.5,0,0
1,ronald,male,80,176.0,89.0,32.0,12.0,0,0


In [4]:
# Number of participants per value of the "faller" column.
additional_info.loc[:, "faller"].value_counts()

faller
0    60
1     5
Name: count, dtype: int64

In [5]:
# Number of participants per value of the "low stability" column.
additional_info.loc[:, "low stability"].value_counts()

low stability
0    55
1    10
Name: count, dtype: int64

In [6]:
# Work on a copy so the table loaded above is never modified by this analysis.
df = additional_info.copy()

# Columns summarised as mean ± SEM in the tables below.
numeric_vars = ["age", "height", "weight", "foot length", "foot width"]

# Split into two cohorts by whether "participant name" is purely numeric:
# numeric names go to `group_students`, every other row goes to `df_older`.
has_numeric_name = df["participant name"].astype(str).str.isnumeric()
group_students = df[has_numeric_name].copy()
df_older = df[~has_numeric_name].copy()

def _mean_sem(s: pd.Series) -> str:
    s = s.dropna()
    return f"{s.mean():.2f} ± {s.sem():.2f}" if len(s) else "NA"

def _compare_two_groups(
    df_sub: pd.DataFrame,
    group_col: str,
    group0_val,
    group1_val,
    label0: str,
    label1: str,
    numeric_vars: list,
    alpha: float = 0.05,
    min_n_normality: int = 5,
) -> pd.DataFrame:
    """Build a comparison table between two groups of ``df_sub``.

    For every numeric variable the two groups are compared with Welch's
    t-test (``equal_var=False``) when both pass a Lilliefors normality check,
    and with a two-sided Mann–Whitney U test otherwise. Gender is compared
    with a chi-square test on the gender x group contingency table.

    Parameters
    ----------
    df_sub : pd.DataFrame
        Rows to compare; must contain ``group_col``, ``"gender"`` and every
        column listed in ``numeric_vars``.
    group_col : str
        Column whose value assigns a row to a group.
    group0_val, group1_val
        Values of ``group_col`` that define the first and second group.
    label0, label1 : str
        Display labels used in the column headers for the two groups.
    numeric_vars : list
        Names of the numeric columns to compare.
    alpha : float, default 0.05
        A group is treated as normally distributed when its Lilliefors
        p-value exceeds this threshold.
    min_n_normality : int, default 5
        Groups with fewer non-missing observations than this are treated as
        non-normal without running the Lilliefors test.

    Returns
    -------
    pd.DataFrame
        One row per numeric variable plus a final gender row, with columns
        ``"Variable"``, ``"<label0> (Mean ± SEM)"``, ``"<label1> (Mean ± SEM)"``,
        ``"Test Used"`` and ``"P-Value"`` (formatted strings; ``"NA"`` when a
        p-value could not be computed).
    """
    g0 = df_sub[df_sub[group_col] == group0_val]
    g1 = df_sub[df_sub[group_col] == group1_val]

    def _looks_normal(values: pd.Series) -> bool:
        # Too-small samples are treated as non-normal without testing.
        return len(values) >= min_n_normality and lilliefors(values)[1] > alpha

    results = []
    for var in numeric_vars:
        v0 = g0[var].dropna()
        v1 = g1[var].dropna()

        normal_0 = _looks_normal(v0)
        normal_1 = _looks_normal(v1)

        if normal_0 and normal_1 and len(v0) > 1 and len(v1) > 1:
            # Welch's t-test: does not assume equal variances.
            _, pval = ttest_ind(v0, v1, equal_var=False)
            test_used = "t-test"
        elif len(v0) > 0 and len(v1) > 0:
            # Two-sided Mann–Whitney when normality is doubtful or n is small.
            _, pval = mannwhitneyu(v0, v1, alternative="two-sided")
            test_used = "Mann–Whitney U"
        else:
            # At least one group has no data: no test was run, so do not
            # label the row as if one had been.
            pval = float("nan")
            test_used = "NA"

        results.append({
            "Variable": var,
            f"{label0} (Mean ± SEM)": _mean_sem(v0),
            f"{label1} (Mean ± SEM)": _mean_sem(v1),
            "Test Used": test_used,
            "P-Value": f"{pval:.4f}" if pd.notna(pval) else "NA"
        })

    # Gender (categorical)
    def gender_count(group):
        return (group["gender"] == "female").sum(), (group["gender"] == "male").sum()

    female0, male0 = gender_count(g0)
    female1, male1 = gender_count(g1)

    # Chi-square on the gender x group table, restricted to the two groups
    # being compared so that rows with any other value of `group_col` cannot
    # leak into the test.
    # NOTE(review): for small expected cell counts Fisher's exact test may be
    # more appropriate than chi-square — confirm with the analysis plan.
    compared = df_sub[df_sub[group_col].isin([group0_val, group1_val])]
    cont = pd.crosstab(compared["gender"], compared[group_col])
    try:
        _, p_chi, _, _ = chi2_contingency(cont)
        pcat = f"{p_chi:.4f}" if pd.notna(p_chi) else "NA"
    except ValueError:
        # Raised by chi2_contingency for an empty or degenerate table.
        pcat = "NA"

    results.append({
        "Variable": "gender (female / male)",
        f"{label0} (Mean ± SEM)": f"{female0} / {male0}",
        f"{label1} (Mean ± SEM)": f"{female1} / {male1}",
        "Test Used": "Chi-Square",
        "P-Value": pcat
    })

    return pd.DataFrame(results)

def _students_summary_table(group_students: pd.DataFrame, numeric_vars: list) -> pd.DataFrame:
    """Describe the student cohort: one row per numeric variable plus gender.

    Each numeric row holds the formatted mean ± SEM of the non-missing values
    and how many such values there are. The final row holds the female / male
    counts and the total number of rows in ``group_students``.
    """
    def _numeric_row(column: str) -> dict:
        observed = group_students[column].dropna()
        return {
            "Variable": column,
            "Students (Mean ± SEM)": _mean_sem(observed),
            "N": len(observed),
        }

    table_rows = [_numeric_row(column) for column in numeric_vars]

    # Gender is categorical, so report counts instead of mean ± SEM.
    gender = group_students["gender"]
    n_female = gender.eq("female").sum()
    n_male = gender.eq("male").sum()
    table_rows.append({
        "Variable": "gender (female / male)",
        "Students (Mean ± SEM)": f"{n_female} / {n_male}",
        "N": len(group_students),
    })

    return pd.DataFrame(table_rows)

### Low vs Non-Low stability (older adults only)

In [7]:
# Compare rows of df_older with "low stability" == 0 against those with == 1.
table_low_stability = _compare_two_groups(
    df_sub=df_older,
    numeric_vars=numeric_vars,
    group_col="low stability",
    group0_val=0,
    label0="Non-Low Stability",
    group1_val=1,
    label1="Low Stability",
)
table_low_stability

Unnamed: 0,Variable,Non-Low Stability (Mean ± SEM),Low Stability (Mean ± SEM),Test Used,P-Value
0,age,74.96 ± 0.81,73.30 ± 1.57,Mann–Whitney U,0.1424
1,height,164.61 ± 1.62,163.40 ± 3.42,t-test,0.7547
2,weight,72.14 ± 2.48,77.10 ± 3.12,t-test,0.2267
3,foot length,28.54 ± 0.42,28.35 ± 0.70,t-test,0.8225
4,foot width,11.05 ± 0.23,10.85 ± 0.33,t-test,0.6149
5,gender (female / male),14 / 14,6 / 4,Chi-Square,0.8613


### Non-fallers vs Fallers (older adults only)

In [8]:
# Compare rows of df_older with "faller" == 0 against those with == 1.
table_fallers = _compare_two_groups(
    df_sub=df_older,
    numeric_vars=numeric_vars,
    group_col="faller",
    group0_val=0,
    label0="Non-fallers",
    group1_val=1,
    label1="Fallers",
)
table_fallers

Unnamed: 0,Variable,Non-fallers (Mean ± SEM),Fallers (Mean ± SEM),Test Used,P-Value
0,age,74.45 ± 0.72,75.00 ± 3.03,Mann–Whitney U,0.6959
1,height,164.79 ± 1.53,161.00 ± 5.07,t-test,0.5079
2,weight,72.97 ± 2.28,76.60 ± 2.62,t-test,0.3173
3,foot length,28.58 ± 0.38,27.90 ± 1.04,Mann–Whitney U,0.3287
4,foot width,11.08 ± 0.20,10.50 ± 0.52,t-test,0.3501
5,gender (female / male),17 / 16,3 / 2,Chi-Square,1.0


### Students only

In [9]:
# Table 3: descriptive summary of the student cohort (no group comparison).
table_students = _students_summary_table(
    group_students=group_students,
    numeric_vars=numeric_vars,
)
table_students

Unnamed: 0,Variable,Students (Mean ± SEM),N
0,age,23.78 ± 0.23,27
1,height,172.76 ± 1.86,27
2,weight,66.13 ± 2.62,27
3,foot length,28.48 ± 0.44,27
4,foot width,10.61 ± 0.16,27
5,gender (female / male),16 / 11,27
